/* One-off weight-surgery utility: loads a network cfg, perturbs the
 * second-to-last layer's parameters around a shifted weight-file load,
 * then replicates the first third of that layer's biases/weights into
 * the remaining two thirds and saves the result.
 * NOTE(review): the constants 11921 and the +/-5 (bias) / +/-5*c (weight)
 * offsets appear dataset/weight-file specific — confirm against the
 * weight file this tool is meant to consume. */
void oneoff(char *cfgfile, char *weightfile, char *outfile)
{
    gpu_index = -1;  /* force CPU-only parsing/loading */
    network *net = parse_network_cfg(cfgfile);
    int oldn = net->layers[net->n - 2].n;   /* real output count of layer n-2 */
    int c = net->layers[net->n - 2].c;      /* its input channel count */
    /* damp existing weights and zero biases before loading new ones */
    scal_cpu(oldn*c, .1, net->layers[net->n - 2].weights, 1);
    scal_cpu(oldn, 0, net->layers[net->n - 2].biases, 1);
    /* temporarily lie about the layer size and shift the parameter
       pointers so load_weights() fills a window offset by 5 outputs */
    net->layers[net->n - 2].n = 11921;
    net->layers[net->n - 2].biases += 5;
    net->layers[net->n - 2].weights += 5*c;
    if(weightfile){
        load_weights(net, weightfile);
    }
    /* restore the true pointers and geometry */
    net->layers[net->n - 2].biases -= 5;
    net->layers[net->n - 2].weights -= 5*c;
    net->layers[net->n - 2].n = oldn;
    printf("%d\n", oldn);
    layer l = net->layers[net->n - 2];
    /* duplicate the first third of the parameters into thirds 2 and 3 */
    copy_cpu(l.n/3, l.biases, 1, l.biases + l.n/3, 1);
    copy_cpu(l.n/3, l.biases, 1, l.biases + 2*l.n/3, 1);
    copy_cpu(l.n/3*l.c, l.weights, 1, l.weights + l.n/3*l.c, 1);
    copy_cpu(l.n/3*l.c, l.weights, 1, l.weights + 2*l.n/3*l.c, 1);
    *net->seen = 0;  /* reset the images-seen counter for the saved file */
    save_weights(net, outfile);
    free_network(net);
}
/* CPU forward pass for batch normalization.
 * Standalone BATCHNORM layers copy their input through first; CONNECTED
 * layers are reshaped so each output neuron is treated as a 1x1 channel.
 * Training: compute per-channel batch mean/variance, fold them into the
 * rolling statistics with momentum 0.99, stash pre- and post-normalization
 * activations (l.x / l.x_norm) for the backward pass, then normalize.
 * Inference: normalize with the rolling statistics instead.
 * Finally applies the learned per-channel scale (gamma) and bias (beta). */
void forward_batchnorm_layer(layer l, network_state state)
{
    if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1);
    if(l.type == CONNECTED){
        /* view the fully-connected output as l.outputs channels of 1x1 */
        l.out_c = l.outputs;
        l.out_h = l.out_w = 1;
    }
    if(state.train){
        mean_cpu(l.output, l.batch, l.out_c, l.out_h*l.out_w, l.mean);
        variance_cpu(l.output, l.mean, l.batch, l.out_c, l.out_h*l.out_w, l.variance);
        /* rolling = .99*rolling + .01*batch statistic */
        scal_cpu(l.out_c, .99, l.rolling_mean, 1);
        axpy_cpu(l.out_c, .01, l.mean, 1, l.rolling_mean, 1);
        scal_cpu(l.out_c, .99, l.rolling_variance, 1);
        axpy_cpu(l.out_c, .01, l.variance, 1, l.rolling_variance, 1);
        /* save raw activations, normalize, then save normalized copy —
           both are consumed by backward_batchnorm_layer */
        copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);
        normalize_cpu(l.output, l.mean, l.variance, l.batch, l.out_c, l.out_h*l.out_w);
        copy_cpu(l.outputs*l.batch, l.output, 1, l.x_norm, 1);
    } else {
        normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.out_c, l.out_h*l.out_w);
    }
    scale_bias(l.output, l.scales, l.batch, l.out_c, l.out_h*l.out_w);
    add_bias(l.output, l.biases, l.batch, l.out_c, l.out_h*l.out_w);
}
/* CPU forward pass for a fully connected layer with optional inline
 * batch normalization.
 * Computes output = input * weights^T via GEMM (A untransposed, B
 * transposed), then batch-norm (if enabled), per-row bias add, and the
 * layer's activation.
 * NOTE(review): the rolling-statistic momentum here is .95/.05, unlike
 * the .99/.01 used by forward_batchnorm_layer elsewhere in this file —
 * confirm the difference is intentional. */
void forward_connected_layer(connected_layer l, network_state state)
{
    int i;
    fill_cpu(l.outputs*l.batch, 0, l.output, 1);  /* GEMM accumulates into output */
    int m = l.batch;
    int k = l.inputs;
    int n = l.outputs;
    float *a = state.input;
    float *b = l.weights;
    float *c = l.output;
    gemm(0,1,m,n,k,1,a,k,b,k,1,c,n);
    if(l.batch_normalize){
        if(state.train){
            /* batch statistics over the l.outputs "channels" (spatial=1) */
            mean_cpu(l.output, l.batch, l.outputs, 1, l.mean);
            variance_cpu(l.output, l.mean, l.batch, l.outputs, 1, l.variance);
            /* rolling = .95*rolling + .05*batch statistic */
            scal_cpu(l.outputs, .95, l.rolling_mean, 1);
            axpy_cpu(l.outputs, .05, l.mean, 1, l.rolling_mean, 1);
            scal_cpu(l.outputs, .95, l.rolling_variance, 1);
            axpy_cpu(l.outputs, .05, l.variance, 1, l.rolling_variance, 1);
            /* stash raw and normalized activations for backward pass */
            copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);
            normalize_cpu(l.output, l.mean, l.variance, l.batch, l.outputs, 1);
            copy_cpu(l.outputs*l.batch, l.output, 1, l.x_norm, 1);
        } else {
            normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.outputs, 1);
        }
        scale_bias(l.output, l.scales, l.batch, l.outputs, 1);
    }
    /* add the bias vector to every batch row */
    for(i = 0; i < l.batch; ++i){
        axpy_cpu(l.outputs, 1, l.biases, 1, l.output + i*l.outputs, 1);
    }
    activate_array(l.output, l.outputs*l.batch, l.activation);
}
/* CPU backward pass (BPTT) for a vanilla RNN layer.
 * The three inner connected layers (input, self/recurrent, output) are
 * walked backwards in time by advancing local copies to the last step
 * and decrementing each iteration. l.state is a pointer into a buffer
 * holding the hidden state for every timestep; it is moved to the end
 * and stepped back in lockstep. Statement order is load-bearing:
 * l.state must be rebuilt/rewound between the output-layer and
 * self-layer backward calls. */
void backward_rnn_layer(layer l, network_state state)
{
    network_state s = { 0 };
    s.train = state.train;
    int i;
    /* local copies whose buffer pointers increment_layer() shifts per step */
    layer input_layer = *(l.input_layer);
    layer self_layer = *(l.self_layer);
    layer output_layer = *(l.output_layer);
    increment_layer(&input_layer, l.steps - 1);
    increment_layer(&self_layer, l.steps - 1);
    increment_layer(&output_layer, l.steps - 1);
    l.state += l.hidden * l.batch * l.steps;  /* point one-past the last step's state */
    for (i = l.steps - 1; i >= 0; --i) {
        /* recreate this step's hidden state = input_layer + self_layer outputs */
        copy_cpu(l.hidden * l.batch, input_layer.output, 1, l.state, 1);
        axpy_cpu(l.hidden * l.batch, 1, self_layer.output, 1, l.state, 1);
        s.input = l.state;
        s.delta = self_layer.delta;
        backward_connected_layer(output_layer, s);
        l.state -= l.hidden * l.batch;  /* rewind to the previous step's state */
        /*
           if(i > 0){
           copy_cpu(l.hidden * l.batch, input_layer.output - l.hidden*l.batch, 1, l.state, 1);
           axpy_cpu(l.hidden * l.batch, 1, self_layer.output - l.hidden*l.batch, 1, l.state, 1);
           }else{
           fill_cpu(l.hidden * l.batch, 0, l.state, 1);
           }
         */
        s.input = l.state;
        /* propagate into the previous timestep's recurrent delta; none at t=0 */
        s.delta = self_layer.delta - l.hidden * l.batch;
        if (i == 0) s.delta = 0;
        backward_connected_layer(self_layer, s);
        copy_cpu(l.hidden * l.batch, self_layer.delta, 1, input_layer.delta, 1);
        /* shortcut connections also feed the recurrent delta straight back */
        if (i > 0 && l.shortcut) axpy_cpu(l.hidden * l.batch, 1, self_layer.delta, 1, self_layer.delta - l.hidden * l.batch, 1);
        s.input = state.input + i * l.inputs * l.batch;
        if (state.delta) s.delta = state.delta + i * l.inputs * l.batch;
        else s.delta = 0;
        backward_connected_layer(input_layer, s);
        increment_layer(&input_layer, -1);
        increment_layer(&self_layer, -1);
        increment_layer(&output_layer, -1);
    }
}
/* CPU forward pass for a locally connected layer: like a convolution,
 * but every spatial location has its own, unshared filter bank.
 * For each batch item: unroll the input patches with im2col, then for
 * each of the out_h*out_w locations run a small (l.n x 1) GEMM against
 * that location's private weights, writing one output column. */
void forward_local_layer(const local_layer l, network_state state)
{
    int out_h = local_out_height(l);
    int out_w = local_out_width(l);
    int i, j;
    int locations = out_h * out_w;
    /* seed each batch item's output with the biases */
    for (i = 0; i < l.batch; ++i) {
        copy_cpu(l.outputs, l.biases, 1, l.output + i * l.outputs, 1);
    }
    for (i = 0; i < l.batch; ++i) {
        float *input = state.input + i * l.w * l.h * l.c;
        /* unroll patches: col_image is (size*size*c) x locations */
        im2col_cpu(input, l.c, l.h, l.w, l.size, l.stride, l.pad, l.col_image);
        float *output = l.output + i * l.outputs;
        for (j = 0; j < locations; ++j) {
            /* per-location weights: l.n filters of size size*size*c */
            float *a = l.weights + j * l.size * l.size * l.c * l.n;
            float *b = l.col_image + j;   /* j-th column, stride = locations */
            float *c = output + j;        /* j-th output column, stride = locations */
            int m = l.n;
            int n = 1;
            int k = l.size * l.size * l.c;
            gemm(0, 0, m, n, k, 1, a, k, b, locations, 1, c, locations);
        }
    }
    activate_array(l.output, l.outputs * l.batch, l.activation);
}
/* Predict a move distribution for a 19x19 Go board.
 * With multi set, averages the network output over the 8 dihedral
 * symmetries: the board is rotated (and flipped for i>=4) in place,
 * re-predicted, and the prediction is un-transformed before being
 * accumulated; the board is restored before the next iteration.
 * Occupied points are zeroed out of the final move map.
 * NOTE(review): the inner `output` shadows the outer one and both point
 * at the network's output buffer, which network_predict overwrites each
 * call — the i=0 prediction in `move` is the only saved copy. */
void predict_move(network net, float *board, float *move, int multi)
{
    float *output = network_predict(net, board);
    copy_cpu(19*19, output, 1, move, 1);  /* identity-transform prediction */
    int i;
    if(multi){
        image bim = float_to_image(19, 19, 1, board);
        for(i = 1; i < 8; ++i){
            /* transform the board in place */
            rotate_image_cw(bim, i);
            if(i >= 4) flip_image(bim);
            float *output = network_predict(net, board);
            image oim = float_to_image(19, 19, 1, output);
            /* undo the transform on the prediction so it aligns with `move` */
            if(i >= 4) flip_image(oim);
            rotate_image_cw(oim, -i);
            axpy_cpu(19*19, 1, output, 1, move, 1);
            /* restore the board for the next symmetry */
            if(i >= 4) flip_image(bim);
            rotate_image_cw(bim, -i);
        }
        scal_cpu(19*19, 1./8., move, 1);  /* average the 8 predictions */
    }
    /* never propose playing on an occupied point */
    for(i = 0; i < 19*19; ++i){
        if(board[i]) move[i] = 0;
    }
}
/*
 * Benchmark driver: reads a matrix size N, forms C = A*B, projects out the
 * B component (C -= (C·B)B), normalizes to D = C/||C||, on both a
 * host/OpenACC path and a GPU path, then reports timings and the error
 * between the two D results.
 *
 * Fixes over the original: the scanf result and every malloc are checked,
 * and all heap allocations are freed before returning (they were leaked).
 * Returns 0 on success, 1 on bad input or allocation failure.
 */
int main()
{
    int N, N2;
    printf(" \n Input matrix size N x N, N = ");
    /* reject non-numeric or non-positive sizes before allocating N*N buffers */
    if (scanf("%d", &N) != 1 || N <= 0) {
        fprintf(stderr, " invalid matrix size\n");
        return 1;
    }
    printf(" N = %d \n \n", N);
    N2 = N*N;
    double *A, *B, *C_cpu, *C_gpu, *D_cpu, *D_gpu, t1, t2, cpu_time, gpu_time;
    double r_cpu, *r_gpu, nrmC_cpu, *nrmC_gpu;
    A = (double *) malloc(N2*sizeof(double));
    B = (double *) malloc(N2*sizeof(double));
    C_cpu = (double *) malloc(N2*sizeof(double));
    C_gpu = (double *) malloc(N2*sizeof(double));
    D_cpu = (double *) malloc(N2*sizeof(double));
    D_gpu = (double *) malloc(N2*sizeof(double));
    r_gpu = (double *) malloc(1*sizeof(double));
    nrmC_gpu = (double *) malloc(1*sizeof(double));
    /* bail out cleanly if any allocation failed (free(NULL) is a no-op) */
    if (!A || !B || !C_cpu || !C_gpu || !D_cpu || !D_gpu || !r_gpu || !nrmC_gpu) {
        fprintf(stderr, " allocation failure for N = %d\n", N);
        free(A); free(B); free(C_cpu); free(C_gpu);
        free(D_cpu); free(D_gpu); free(r_gpu); free(nrmC_gpu);
        return 1;
    }
    initial(A, B, N);
    /* ---- host/OpenACC reference path ---- */
    t1 = clock();
    #pragma acc data copyin(A[0:N2], B[0:N2]) copyout(C_cpu[0:N2])
    {
        cublas_gemm(A, B, C_cpu, N);
    }
    r_cpu = dot_cpu(C_cpu, B, N2);            /* projection coefficient (C·B) */
    axpy_cpu(-1.0*r_cpu, B, C_cpu, N2);       /* C -= (C·B) B */
    nrmC_cpu = norm_cpu(C_cpu, N2);
    copy_cpu(C_cpu, D_cpu, N2);
    scal_cpu(1.0/nrmC_cpu, D_cpu, N2);        /* D = C / ||C|| */
    t2 = clock();
    cpu_time = 1.0*(t2 - t1)/CLOCKS_PER_SEC;
    /* ---- GPU path ---- */
    t1 = clock();
    #pragma acc enter data copyin(A[0:N2], B[0:N2]) create(C_gpu[0:N2], r_gpu[0], nrmC_gpu[0], D_gpu[0:N2])
    {
        gpu_cublas1(A, B, C_gpu, D_gpu, r_gpu, nrmC_gpu, N, N2);
    }
    #pragma acc update host(D_gpu[0:N2])
    t2 = clock();
    gpu_time = 1.0*(t2 - t1)/CLOCKS_PER_SEC;
    printf(" gpu part success \n");
    printf(" \n error = %f \n", error(D_cpu, D_gpu, N2));
    printf(" gpu time = %f, cpu times = %f \n", gpu_time, cpu_time);
    /* release everything (the original leaked all eight buffers) */
    free(A); free(B); free(C_cpu); free(C_gpu);
    free(D_cpu); free(D_gpu); free(r_gpu); free(nrmC_gpu);
    return 0;
}
/* CPU backward pass for batch normalization.
 * Uses the activations saved during forward (l.x raw, l.x_norm
 * normalized) to compute gradients for the scale parameters and to
 * backpropagate l.delta through the normalization. Standalone
 * BATCHNORM layers then copy their delta to the previous layer.
 * NOTE(review): this path uses the batch mean/variance, i.e. it assumes
 * a training-mode forward pass preceded it. */
void backward_batchnorm_layer(const layer l, network_state state)
{
    /* d(gamma): correlate incoming delta with the normalized activations */
    backward_scale_cpu(l.x_norm, l.delta, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates);
    /* scale delta by gamma before differentiating the normalization */
    scale_bias(l.delta, l.scales, l.batch, l.out_c, l.out_h*l.out_w);
    mean_delta_cpu(l.delta, l.variance, l.batch, l.out_c, l.out_w*l.out_h, l.mean_delta);
    variance_delta_cpu(l.x, l.delta, l.mean, l.variance, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta);
    /* combine the three gradient terms into l.delta in place */
    normalize_delta_cpu(l.x, l.mean, l.variance, l.mean_delta, l.variance_delta, l.batch, l.out_c, l.out_w*l.out_h, l.delta);
    if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, l.delta, 1, state.delta, 1);
}
/* Route layer forward pass: concatenate the outputs of the configured
 * source layers into this layer's output, batch item by batch item.
 * `offset` tracks where each source's slice begins within one
 * concatenated row of length l.outputs. */
void forward_route_layer(const route_layer l, network_state state)
{
    int src, b;
    int offset = 0;
    for(src = 0; src < l.n; ++src){
        float *source_output = state.net.layers[l.input_layers[src]].output;
        int size = l.input_sizes[src];
        /* copy this source's chunk for every item in the batch */
        for(b = 0; b < l.batch; ++b){
            copy_cpu(size, source_output + b*size, 1,
                     l.output + offset + b*l.outputs, 1);
        }
        offset += size;
    }
}
/* Route layer backward pass: scatter this layer's delta back to the
 * delta buffers of the source layers it concatenated.
 * Fix: the original used copy_cpu, which OVERWRITES the source layer's
 * delta. A layer routed into several consumers must receive the SUM of
 * their gradients, so we accumulate with axpy_cpu (delta += l.delta
 * slice), matching the upstream darknet implementation. This relies on
 * delta buffers being zeroed at the start of each backward pass, which
 * the framework's training loop does for every layer. */
void backward_route_layer(const route_layer l, network net)
{
    int i, j;
    int offset = 0;
    for(i = 0; i < l.n; ++i){
        int index = l.input_layers[i];
        float *delta = net.layers[index].delta;
        int input_size = l.input_sizes[i];
        for(j = 0; j < l.batch; ++j){
            /* accumulate, don't overwrite: other consumers of layer
               `index` may already have written their gradient here */
            axpy_cpu(input_size, 1, l.delta + offset + j*l.outputs, 1, delta + j*input_size, 1);
        }
        offset += input_size;
    }
}
/* Sum-of-squared-error cost layer forward pass.
 * delta = truth - prediction; cost = ||delta||^2.
 * MASKED cost zeroes predictions wherever the truth is zero, excluding
 * those positions from the error. No-op when no ground truth is given. */
void forward_cost_layer(cost_layer l, network_state state)
{
    if (!state.truth) return;
    int total = l.batch*l.inputs;
    if(l.cost_type == MASKED){
        /* mask out positions the truth does not cover */
        int j;
        for(j = 0; j < total; ++j){
            if(state.truth[j] == 0) state.input[j] = 0;
        }
    }
    /* delta = truth - input */
    copy_cpu(total, state.truth, 1, l.delta, 1);
    axpy_cpu(total, -1, state.input, 1, l.delta, 1);
    /* scalar cost is the squared L2 norm of delta */
    *(l.output) = dot_cpu(total, l.delta, 1, l.delta, 1);
    //printf("cost: %f\n", *l.output);
}
/* Fully connected layer forward pass (no batch norm in this variant).
 * Seeds each batch row with the bias vector, adds input * weights^T via
 * GEMM, then applies the activation. */
void forward_connected_layer(connected_layer l, network_state state)
{
    int b;
    /* output row b starts as a copy of the biases */
    for(b = 0; b < l.batch; ++b){
        copy_cpu(l.outputs, l.biases, 1, l.output + b*l.outputs, 1);
    }
    /* output += input (batch x inputs) * weights^T (inputs x outputs) */
    gemm(0, 1,
         l.batch, l.outputs, l.inputs, 1,
         state.input, l.inputs,
         l.weights, l.inputs,
         1, l.output, l.outputs);
    activate_array(l.output, l.outputs*l.batch, l.activation);
}
/* Compact layer forward pass: each input row is treated as l.index
 * consecutive splits of l.outputs floats; the splits are summed
 * element-wise into one output row, then activated. */
void forward_compact_layer(const layer l, network_state state)
{
    int n, split;
    for (n = 0; n < l.batch; ++n) {
        float *src = state.input + n*l.inputs;
        float *dst = l.output + n*l.outputs;
        /* first split initializes the output row */
        copy_cpu(l.outputs, src, 1, dst, 1);
        /* remaining splits accumulate into it */
        for (split = 1; split < l.index; ++split) {
            axpy_cpu(l.outputs, 1, src + split*l.outputs, 1, dst, 1);
        }
    }
    activate_array(l.output, l.outputs*l.batch, l.activation);
}
/* Cost layer forward pass supporting MASKED and SMOOTH (smooth-L1)
 * variants; the default is plain squared error.
 * MASKED overwrites predictions with the sentinel wherever the truth
 * carries it, so those positions contribute zero error. The scalar
 * cost is always delta·delta. Does nothing without ground truth. */
void forward_cost_layer(cost_layer l, network_state state)
{
    if (!state.truth) return;
    int total = l.batch*l.inputs;
    if(l.cost_type == MASKED){
        /* neutralize positions marked with the sentinel value */
        int j;
        for(j = 0; j < total; ++j){
            if(state.truth[j] == SECRET_NUM) state.input[j] = SECRET_NUM;
        }
    }
    if(l.cost_type == SMOOTH){
        smooth_l1_cpu(total, state.input, state.truth, l.delta);
    } else {
        /* delta = truth - input */
        copy_cpu(total, state.truth, 1, l.delta, 1);
        axpy_cpu(total, -1, state.input, 1, l.delta, 1);
    }
    /* cost = squared L2 norm of delta */
    *(l.output) = dot_cpu(total, l.delta, 1, l.delta, 1);
    //printf("cost: %f\n", *l.output);
}
/* CPU forward pass for a vanilla RNN layer.
 * Per timestep: input_layer consumes the external input, self_layer
 * consumes the previous hidden state, their outputs are summed into the
 * new hidden state (stored per-step in the l.state buffer when
 * training), and output_layer maps the hidden state to the output.
 * With l.shortcut the previous state is carried forward additively
 * instead of starting from zero. */
void forward_rnn_layer(layer l, network_state state)
{
    network_state s = { 0 };
    s.train = state.train;
    int i;
    /* local copies; increment_layer() advances their buffers one step */
    layer input_layer = *(l.input_layer);
    layer self_layer = *(l.self_layer);
    layer output_layer = *(l.output_layer);
    /* clear all per-step gradient buffers up front */
    fill_cpu(l.outputs * l.batch * l.steps, 0, output_layer.delta, 1);
    fill_cpu(l.hidden * l.batch * l.steps, 0, self_layer.delta, 1);
    fill_cpu(l.hidden * l.batch * l.steps, 0, input_layer.delta, 1);
    if (state.train) fill_cpu(l.hidden * l.batch, 0, l.state, 1);
    for (i = 0; i < l.steps; ++i) {
        s.input = state.input;
        forward_connected_layer(input_layer, s);
        s.input = l.state;  /* previous hidden state feeds the recurrent layer */
        forward_connected_layer(self_layer, s);
        float *old_state = l.state;
        /* training keeps every step's state for BPTT; inference reuses one slot */
        if (state.train) l.state += l.hidden * l.batch;
        if (l.shortcut) {
            copy_cpu(l.hidden * l.batch, old_state, 1, l.state, 1);
        } else {
            fill_cpu(l.hidden * l.batch, 0, l.state, 1);
        }
        /* new state = (shortcut carry) + input contribution + recurrent contribution */
        axpy_cpu(l.hidden * l.batch, 1, input_layer.output, 1, l.state, 1);
        axpy_cpu(l.hidden * l.batch, 1, self_layer.output, 1, l.state, 1);
        s.input = l.state;
        forward_connected_layer(output_layer, s);
        state.input += l.inputs * l.batch;  /* advance to next timestep's input */
        increment_layer(&input_layer, 1);
        increment_layer(&self_layer, 1);
        increment_layer(&output_layer, 1);
    }
}
/* CPU forward pass for an LSTM layer.
 * Eight inner connected layers implement the gates: the w* layers take
 * the previous hidden state h, the u* layers take the current input x.
 * Gate pre-activations are the element-wise sums wf+uf (forget), wi+ui
 * (input), wg+ug (candidate), wo+uo (output). Per step:
 *   c = f*c + i*g ;  h = o * tanh(c)
 * NOTE(review): the u*-delta buffers are cleared with l.outputs-sized
 * fills — confirm those buffers are allocated by outputs in this
 * version, since the u layers consume l.inputs-sized input. */
void forward_lstm_layer(layer l, network state)
{
    network s = { 0 };
    s.train = state.train;
    int i;
    /* local copies; increment_layer() advances their buffers per step */
    layer wf = *(l.wf);
    layer wi = *(l.wi);
    layer wg = *(l.wg);
    layer wo = *(l.wo);
    layer uf = *(l.uf);
    layer ui = *(l.ui);
    layer ug = *(l.ug);
    layer uo = *(l.uo);
    /* zero all gate gradients for every timestep */
    fill_cpu(l.outputs * l.batch * l.steps, 0, wf.delta, 1);
    fill_cpu(l.outputs * l.batch * l.steps, 0, wi.delta, 1);
    fill_cpu(l.outputs * l.batch * l.steps, 0, wg.delta, 1);
    fill_cpu(l.outputs * l.batch * l.steps, 0, wo.delta, 1);
    fill_cpu(l.outputs * l.batch * l.steps, 0, uf.delta, 1);
    fill_cpu(l.outputs * l.batch * l.steps, 0, ui.delta, 1);
    fill_cpu(l.outputs * l.batch * l.steps, 0, ug.delta, 1);
    fill_cpu(l.outputs * l.batch * l.steps, 0, uo.delta, 1);
    if (state.train) {
        fill_cpu(l.outputs * l.batch * l.steps, 0, l.delta, 1);
    }
    for (i = 0; i < l.steps; ++i) {
        /* gate contributions from the previous hidden state */
        s.input = l.h_cpu;
        forward_connected_layer(wf, s);
        forward_connected_layer(wi, s);
        forward_connected_layer(wg, s);
        forward_connected_layer(wo, s);
        /* gate contributions from the current input */
        s.input = state.input;
        forward_connected_layer(uf, s);
        forward_connected_layer(ui, s);
        forward_connected_layer(ug, s);
        forward_connected_layer(uo, s);
        /* f = wf + uf */
        copy_cpu(l.outputs * l.batch, wf.output, 1, l.f_cpu, 1);
        axpy_cpu(l.outputs * l.batch, 1, uf.output, 1, l.f_cpu, 1);
        /* i = wi + ui */
        copy_cpu(l.outputs * l.batch, wi.output, 1, l.i_cpu, 1);
        axpy_cpu(l.outputs * l.batch, 1, ui.output, 1, l.i_cpu, 1);
        /* g = wg + ug */
        copy_cpu(l.outputs * l.batch, wg.output, 1, l.g_cpu, 1);
        axpy_cpu(l.outputs * l.batch, 1, ug.output, 1, l.g_cpu, 1);
        /* o = wo + uo */
        copy_cpu(l.outputs * l.batch, wo.output, 1, l.o_cpu, 1);
        axpy_cpu(l.outputs * l.batch, 1, uo.output, 1, l.o_cpu, 1);
        activate_array(l.f_cpu, l.outputs * l.batch, LOGISTIC);
        activate_array(l.i_cpu, l.outputs * l.batch, LOGISTIC);
        activate_array(l.g_cpu, l.outputs * l.batch, TANH);
        activate_array(l.o_cpu, l.outputs * l.batch, LOGISTIC);
        /* c = f*c + i*g */
        copy_cpu(l.outputs * l.batch, l.i_cpu, 1, l.temp_cpu, 1);
        mul_cpu(l.outputs * l.batch, l.g_cpu, 1, l.temp_cpu, 1);
        mul_cpu(l.outputs * l.batch, l.f_cpu, 1, l.c_cpu, 1);
        axpy_cpu(l.outputs * l.batch, 1, l.temp_cpu, 1, l.c_cpu, 1);
        /* h = o * tanh(c) */
        copy_cpu(l.outputs * l.batch, l.c_cpu, 1, l.h_cpu, 1);
        activate_array(l.h_cpu, l.outputs * l.batch, TANH);
        mul_cpu(l.outputs * l.batch, l.o_cpu, 1, l.h_cpu, 1);
        /* record this timestep's cell state and output */
        copy_cpu(l.outputs * l.batch, l.c_cpu, 1, l.cell_cpu, 1);
        copy_cpu(l.outputs * l.batch, l.h_cpu, 1, l.output, 1);
        /* advance all per-timestep pointers */
        state.input += l.inputs * l.batch;
        l.output += l.outputs * l.batch;
        l.cell_cpu += l.outputs * l.batch;
        increment_layer(&wf, 1);
        increment_layer(&wi, 1);
        increment_layer(&wg, 1);
        increment_layer(&wo, 1);
        increment_layer(&uf, 1);
        increment_layer(&ui, 1);
        increment_layer(&ug, 1);
        increment_layer(&uo, 1);
    }
}
/* One gradient-ascent step of the "nightmare"/deep-dream optimizer.
 * Truncates the network at max_layer, runs a randomly jittered, scaled
 * and possibly flipped crop of orig forward, uses the layer's own
 * activations as the objective (delta = output, shaped by
 * calculate_loss with thresh), backpropagates to get an image-space
 * gradient, undoes the jitter/flip on that gradient, and adds it to
 * orig at the given rate. The image is mapped to [-1,1] on entry and
 * back to [0,1] on exit, then clamped.
 * Mutates orig in place; also truncates net->n (callers rely on
 * resize_network / later calls to handle that).
 * NOTE(review): the GPU path passes last.delta as both src and dst to
 * calculate_loss, while the CPU path passes last.output then last.delta
 * — confirm the asymmetry is intentional. */
void optimize_picture(network *net, image orig, int max_layer, float scale, float rate, float thresh, int norm)
{
    /* map pixels from [0,1] to [-1,1] */
    scale_image(orig, 2);
    translate_image(orig, -1);
    net->n = max_layer + 1;  /* truncate: forward/backward stop at max_layer */
    /* random jitter and flip for regularization across iterations */
    int dx = rand()%16 - 8;
    int dy = rand()%16 - 8;
    int flip = rand()%2;
    image crop = crop_image(orig, dx, dy, orig.w, orig.h);
    image im = resize_image(crop, (int)(orig.w * scale), (int)(orig.h * scale));
    if(flip) flip_image(im);
    resize_network(net, im.w, im.h);
    layer last = net->layers[net->n-1];
    //net->layers[net->n - 1].activation = LINEAR;
    image delta = make_image(im.w, im.h, im.c);
    network_state state = {0};
#ifdef GPU
    state.input = cuda_make_array(im.data, im.w*im.h*im.c);
    state.delta = cuda_make_array(im.data, im.w*im.h*im.c);
    forward_network_gpu(*net, state);
    /* objective: maximize the layer's own activations (delta = output) */
    copy_ongpu(last.outputs, last.output_gpu, 1, last.delta_gpu, 1);
    cuda_pull_array(last.delta_gpu, last.delta, last.outputs);
    calculate_loss(last.delta, last.delta, last.outputs, thresh);
    cuda_push_array(last.delta_gpu, last.delta, last.outputs);
    backward_network_gpu(*net, state);
    cuda_pull_array(state.delta, delta.data, im.w*im.h*im.c);
    cuda_free(state.input);
    cuda_free(state.delta);
#else
    state.input = im.data;
    state.delta = delta.data;
    forward_network(*net, state);
    copy_cpu(last.outputs, last.output, 1, last.delta, 1);
    calculate_loss(last.output, last.delta, last.outputs, thresh);
    backward_network(*net, state);
#endif
    /* undo the augmentation so the gradient aligns with orig */
    if(flip) flip_image(delta);
    //normalize_array(delta.data, delta.w*delta.h*delta.c);
    image resized = resize_image(delta, orig.w, orig.h);
    image out = crop_image(resized, -dx, -dy, orig.w, orig.h);
    /*
       image g = grayscale_image(out);
       free_image(out);
       out = g;
     */
    //rate = rate / abs_mean(out.data, out.w*out.h*out.c);
    if(norm) normalize_array(out.data, out.w*out.h*out.c);
    /* gradient-ascent step on the original image */
    axpy_cpu(orig.w*orig.h*orig.c, rate, out.data, 1, orig.data, 1);
    /*
       normalize_array(orig.data, orig.w*orig.h*orig.c);
       scale_image(orig, sqrt(var));
       translate_image(orig, mean);
     */
    /* map back from [-1,1] to [0,1] and clamp */
    translate_image(orig, 1);
    scale_image(orig, .5);
    //normalize_image(orig);
    constrain_image(orig);
    free_image(crop);
    free_image(im);
    free_image(delta);
    free_image(resized);
    free_image(out);
}
void run_nightmare(int argc, char **argv) { srand(0); if(argc < 4){ fprintf(stderr, "usage: %s %s [cfg] [weights] [image] [layer] [options! (optional)]\n", argv[0], argv[1]); return; } char *cfg = argv[2]; char *weights = argv[3]; char *input = argv[4]; int max_layer = atoi(argv[5]); int range = find_int_arg(argc, argv, "-range", 1); int norm = find_int_arg(argc, argv, "-norm", 1); int rounds = find_int_arg(argc, argv, "-rounds", 1); int iters = find_int_arg(argc, argv, "-iters", 10); int octaves = find_int_arg(argc, argv, "-octaves", 4); float zoom = find_float_arg(argc, argv, "-zoom", 1.); float rate = find_float_arg(argc, argv, "-rate", .04); float thresh = find_float_arg(argc, argv, "-thresh", 1.); float rotate = find_float_arg(argc, argv, "-rotate", 0); float momentum = find_float_arg(argc, argv, "-momentum", .9); float lambda = find_float_arg(argc, argv, "-lambda", .01); char *prefix = find_char_arg(argc, argv, "-prefix", 0); int reconstruct = find_arg(argc, argv, "-reconstruct"); int smooth_size = find_int_arg(argc, argv, "-smooth", 1); network net = parse_network_cfg(cfg); load_weights(&net, weights); char *cfgbase = basecfg(cfg); char *imbase = basecfg(input); set_batch_network(&net, 1); image im = load_image_color(input, 0, 0); if(0){ float scale = 1; if(im.w > 512 || im.h > 512){ if(im.w > im.h) scale = 512.0/im.w; else scale = 512.0/im.h; } image resized = resize_image(im, scale*im.w, scale*im.h); free_image(im); im = resized; } float *features; image update; if (reconstruct){ resize_network(&net, im.w, im.h); int size = get_network_output_size(net); features = calloc(size, sizeof(float)); float *out = network_predict(net, im.data); copy_cpu(size, out, 1, features, 1); free_image(im); im = make_random_image(im.w, im.h, im.c); update = make_image(im.w, im.h, im.c); } int e; int n; for(e = 0; e < rounds; ++e){ fprintf(stderr, "Iteration: "); fflush(stderr); for(n = 0; n < iters; ++n){ fprintf(stderr, "%d, ", n); fflush(stderr); if(reconstruct){ 
reconstruct_picture(net, features, im, update, rate, momentum, lambda, smooth_size); show_image(im, "reconstruction"); #ifdef OPENCV cvWaitKey(10); #endif }else{ int layer = max_layer + rand()%range - range/2; int octave = rand()%octaves; optimize_picture(&net, im, layer, 1/pow(1.33333333, octave), rate, thresh, norm); } } fprintf(stderr, "done\n"); if(0){ image g = grayscale_image(im); free_image(im); im = g; } char buff[256]; if (prefix){ sprintf(buff, "%s/%s_%s_%d_%06d",prefix, imbase, cfgbase, max_layer, e); }else{ sprintf(buff, "%s_%s_%d_%06d",imbase, cfgbase, max_layer, e); } printf("%d %s\n", e, buff); save_image(im, buff); //show_image(im, buff); //cvWaitKey(0); if(rotate){ image rot = rotate_image(im, rotate); free_image(im); im = rot; } image crop = crop_image(im, im.w * (1. - zoom)/2., im.h * (1.-zoom)/2., im.w*zoom, im.h*zoom); image resized = resize_image(crop, im.w, im.h); free_image(im); free_image(crop); im = resized; } }
/* Adversarial training loop (GPU-only build): trains a generator `net`
 * against an adversary/discriminator `anet` on COCO images.
 * Each outer iteration: load a batch asynchronously, run the generator,
 * feed the image-producing layer's output to the adversary with an
 * all-ones target, backprop the adversary's input gradient into the
 * generator, collect generated images, then train the adversary on the
 * real+generated mix. Checkpoints are written every 100/1000 batches.
 * NOTE(review): X/y/imerror/ones_gpu and the path arrays are never
 * freed, and the final load_thread is never joined — acceptable for a
 * run-to-completion training tool, but worth confirming. */
void train_lsd2(char *cfgfile, char *weightfile, char *acfgfile, char *aweightfile, int clear)
{
#ifdef GPU
    char *train_images = "/home/pjreddie/data/coco/trainvalno5k.txt";
    char *backup_directory = "/home/pjreddie/backup/";
    srand(time(0));
    char *base = basecfg(cfgfile);
    printf("%s\n", base);
    /* generator network */
    network net = parse_network_cfg(cfgfile);
    if(weightfile){
        load_weights(&net, weightfile);
    }
    if(clear) *net.seen = 0;
    /* adversary network */
    char *abase = basecfg(acfgfile);
    network anet = parse_network_cfg(acfgfile);
    if(aweightfile){
        load_weights(&anet, aweightfile);
    }
    if(clear) *anet.seen = 0;
    int i, j, k;
    /* the generator's image-producing layer: first layer with 3 output channels */
    layer imlayer = {};
    for (i = 0; i < net.n; ++i) {
        if (net.layers[i].out_c == 3) {
            imlayer = net.layers[i];
            break;
        }
    }
    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
    int imgs = net.batch*net.subdivisions;
    i = *net.seen/imgs;  /* resume the iteration counter */
    data train, buffer;
    list *plist = get_paths(train_images);
    //int N = plist->size;
    char **paths = (char **)list_to_array(plist);
    /* async data-loader configuration */
    load_args args = {};
    args.w = net.w;
    args.h = net.h;
    args.paths = paths;
    args.n = imgs;
    args.m = plist->size;
    args.d = &buffer;
    args.min = net.min_crop;
    args.max = net.max_crop;
    args.angle = net.angle;
    args.aspect = net.aspect;
    args.exposure = net.exposure;
    args.saturation = net.saturation;
    args.hue = net.hue;
    args.size = net.w;
    args.type = CLASSIFICATION_DATA;
    args.classes = 1;
    char *ls[1] = {"coco"};
    args.labels = ls;
    pthread_t load_thread = load_data_in_thread(args);
    clock_t time;
    /* generator forward/backward state */
    network_state gstate = {};
    gstate.index = 0;
    gstate.net = net;
    int x_size = get_network_input_size(net)*net.batch;
    int y_size = 1*net.batch;
    gstate.input = cuda_make_array(0, x_size);
    gstate.truth = 0;
    gstate.delta = 0;
    gstate.train = 1;
    float *X = (float*)calloc(x_size, sizeof(float));
    float *y = (float*)calloc(y_size, sizeof(float));
    /* adversary forward/backward state */
    network_state astate = {};
    astate.index = 0;
    astate.net = anet;
    int ay_size = get_network_output_size(anet)*anet.batch;
    astate.input = 0;
    astate.truth = 0;
    astate.delta = 0;
    astate.train = 1;
    /* imerror receives the adversary's gradient w.r.t. the generated image */
    float *imerror = cuda_make_array(0, imlayer.outputs);
    float *ones_gpu = cuda_make_array(0, ay_size);
    fill_ongpu(ay_size, 1, ones_gpu, 1);  /* "real" target for fooling the adversary */
    float aloss_avg = -1;
    float gloss_avg = -1;
    //data generated = copy_data(train);
    while (get_current_batch(net) < net.max_batches) {
        i += 1;
        time=clock();
        pthread_join(load_thread, 0);
        train = buffer;
        load_thread = load_data_in_thread(args);  /* prefetch the next batch */
        printf("Loaded: %lf seconds\n", sec(clock()-time));
        data generated = copy_data(train);
        time=clock();
        float gloss = 0;
        for(j = 0; j < net.subdivisions; ++j){
            get_next_batch(train, net.batch, j*net.batch, X, y);
            cuda_push_array(gstate.input, X, x_size);
            *net.seen += net.batch;
            forward_network_gpu(net, gstate);
            /* adversary judges the generated image; its input gradient
               (how to look "more real") lands in imerror */
            fill_ongpu(imlayer.outputs, 0, imerror, 1);
            astate.input = imlayer.output_gpu;
            astate.delta = imerror;
            astate.truth = ones_gpu;
            forward_network_gpu(anet, astate);
            backward_network_gpu(anet, astate);
            /* add the adversarial gradient to the generator's own delta */
            scal_ongpu(imlayer.outputs, 1, imerror, 1);
            axpy_ongpu(imlayer.outputs, 1, imerror, 1, imlayer.delta_gpu, 1);
            backward_network_gpu(net, gstate);
            printf("features %f\n", cuda_mag_array(imlayer.delta_gpu, imlayer.outputs));
            printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs));
            gloss += get_network_cost(net) /(net.subdivisions*net.batch);
            /* harvest the generated images as "fake" training data (label 0) */
            cuda_pull_array(imlayer.output_gpu, imlayer.output, x_size);
            for(k = 0; k < net.batch; ++k){
                int index = j*net.batch + k;
                copy_cpu(imlayer.outputs, imlayer.output + k*imlayer.outputs, 1, generated.X.vals[index], 1);
                generated.y.vals[index][0] = 0;
            }
        }
        harmless_update_network_gpu(anet);
        /* train the adversary on real + generated images */
        data merge = concat_data(train, generated);
        randomize_data(merge);
        float aloss = train_network(anet, merge);
        update_network_gpu(net);
        update_network_gpu(anet);
        free_data(merge);
        free_data(train);
        free_data(generated);
        if (aloss_avg < 0) aloss_avg = aloss;
        aloss_avg = aloss_avg*.9 + aloss*.1;
        gloss_avg = gloss_avg*.9 + gloss*.1;
        printf("%d: gen: %f, adv: %f | gen_avg: %f, adv_avg: %f, %f rate, %lf seconds, %d images\n", i, gloss, aloss, gloss_avg, aloss_avg, get_current_rate(net), sec(clock()-time), i*imgs);
        /* periodic checkpoints */
        if(i%1000==0){
            char buff[256];
            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
            save_weights(net, buff);
            sprintf(buff, "%s/%s_%d.weights", backup_directory, abase, i);
            save_weights(anet, buff);
        }
        if(i%100==0){
            char buff[256];
            sprintf(buff, "%s/%s.backup", backup_directory, base);
            save_weights(net, buff);
            sprintf(buff, "%s/%s.backup", backup_directory, abase);
            save_weights(anet, buff);
        }
    }
    char buff[256];
    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
    save_weights(net, buff);
#endif
}
/* L2-normalization layer forward pass: pass the input straight through,
 * then normalize it per channel (scratch norms land in l.scales). */
void forward_l2norm_layer(const layer l, network * net)
{
    int total = l.batch * l.outputs;
    copy_cpu(total, net->input, 1, l.output, 1);
    l2normalize_cpu(l.output, l.scales, l.batch, l.out_c, l.out_h*l.out_w);
}
/* Activation layer forward pass: copy the input through unchanged and
 * apply the layer's activation function element-wise. */
void forward_activation_layer(layer l, network net)
{
    int total = l.batch * l.outputs;
    copy_cpu(total, net.input, 1, l.output, 1);
    activate_array(l.output, total, l.activation);
}
/* Adversarial colorization trainer (GPU-only build): the generator
 * `net` maps grayscale images to color, the adversary `anet` scores
 * realness. Each batch is grayscaled into a parallel dataset; the
 * generator is fed gray input with the color original as truth, the
 * adversary's gradient on the generated image is mixed into the
 * generator's final-layer delta (generator loss scaled 1/100), and the
 * adversary then trains on real (label .95) vs generated (label .05)
 * images. Checkpoints every 100/1000 batches.
 * NOTE(review): pixs/graypixs/imerror and the path list are never
 * freed and the final load_thread is not joined — typical for this
 * run-to-completion tool, but worth confirming. */
void train_colorizer(char *cfg, char *weight, char *acfg, char *aweight, int clear, int display)
{
#ifdef GPU
    //char *train_images = "/home/kunle12/data/coco/train1.txt";
    //char *train_images = "/home/kunle12/data/coco/trainvalno5k.txt";
    char *train_images = "/home/kunle12/data/imagenet/imagenet1k.train.list";
    char *backup_directory = "/home/kunle12/backup/";
    srand(time(0));
    char *base = basecfg(cfg);
    char *abase = basecfg(acfg);
    printf("%s\n", base);
    network *net = load_network(cfg, weight, clear);    /* generator */
    network *anet = load_network(acfg, aweight, clear); /* adversary */
    int i, j, k;
    /* the generator's image-producing layer: first with 3 output channels */
    layer imlayer = {0};
    for (i = 0; i < net->n; ++i) {
        if (net->layers[i].out_c == 3) {
            imlayer = net->layers[i];
            break;
        }
    }
    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
    int imgs = net->batch*net->subdivisions;
    i = *net->seen/imgs;  /* resume the iteration counter */
    data train, buffer;
    list *plist = get_paths(train_images);
    //int N = plist->size;
    char **paths = (char **)list_to_array(plist);
    /* async loader configuration */
    load_args args= get_base_args(net);
    args.paths = paths;
    args.n = imgs;
    args.m = plist->size;
    args.d = &buffer;
    args.type = CLASSIFICATION_DATA;
    args.classes = 1;
    char *ls[2] = {"imagenet"};
    args.labels = ls;
    pthread_t load_thread = load_data_in_thread(args);
    clock_t time;
    int x_size = net->inputs*net->batch;
    //int y_size = x_size;
    net->delta = 0;
    net->train = 1;
    float *pixs = calloc(x_size, sizeof(float));      /* color originals */
    float *graypixs = calloc(x_size, sizeof(float));  /* grayscaled copies */
    //float *y = calloc(y_size, sizeof(float));
    //int ay_size = anet->outputs*anet->batch;
    anet->delta = 0;
    anet->train = 1;
    /* adversary's gradient w.r.t. the generated image */
    float *imerror = cuda_make_array(0, imlayer.outputs*imlayer.batch);
    float aloss_avg = -1;
    float gloss_avg = -1;
    //data generated = copy_data(train);
    while (get_current_batch(net) < net->max_batches) {
        i += 1;
        time=clock();
        pthread_join(load_thread, 0);
        train = buffer;
        load_thread = load_data_in_thread(args);  /* prefetch next batch */
        printf("Loaded: %lf seconds\n", sec(clock()-time));
        /* build the grayscale twin dataset; real=.95, gray/generated=.05 */
        data gray = copy_data(train);
        for(j = 0; j < imgs; ++j){
            image gim = float_to_image(net->w, net->h, net->c, gray.X.vals[j]);
            grayscale_image_3c(gim);
            train.y.vals[j][0] = .95;
            gray.y.vals[j][0] = .05;
        }
        time=clock();
        float gloss = 0;
        for(j = 0; j < net->subdivisions; ++j){
            get_next_batch(train, net->batch, j*net->batch, pixs, 0);
            get_next_batch(gray, net->batch, j*net->batch, graypixs, 0);
            /* generator input: gray image; truth: color original */
            cuda_push_array(net->input_gpu, graypixs, net->inputs*net->batch);
            cuda_push_array(net->truth_gpu, pixs, net->truths*net->batch);
            /*
               image origi = float_to_image(net->w, net->h, 3, pixs);
               image grayi = float_to_image(net->w, net->h, 3, graypixs);
               show_image(grayi, "gray");
               show_image(origi, "orig");
               cvWaitKey(0);
             */
            *net->seen += net->batch;
            forward_network_gpu(net);
            /* adversary judges the generated image with target "real" (.95);
               its input gradient lands in imerror */
            fill_gpu(imlayer.outputs*imlayer.batch, 0, imerror, 1);
            copy_gpu(anet->inputs*anet->batch, imlayer.output_gpu, 1, anet->input_gpu, 1);
            fill_gpu(anet->inputs*anet->batch, .95, anet->truth_gpu, 1);
            anet->delta_gpu = imerror;
            forward_network_gpu(anet);
            backward_network_gpu(anet);
            /* down-weight the generator's reconstruction gradient, then add
               the adversarial gradient */
            scal_gpu(imlayer.outputs*imlayer.batch, 1./100., net->layers[net->n-1].delta_gpu, 1);
            scal_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1);
            printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs*imlayer.batch));
            printf("features %f\n", cuda_mag_array(net->layers[net->n-1].delta_gpu, imlayer.outputs*imlayer.batch));
            axpy_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, net->layers[net->n-1].delta_gpu, 1);
            backward_network_gpu(net);
            gloss += *net->cost /(net->subdivisions*net->batch);
            /* replace the gray images with the generator's colorizations */
            for(k = 0; k < net->batch; ++k){
                int index = j*net->batch + k;
                copy_cpu(imlayer.outputs, imlayer.output + k*imlayer.outputs, 1, gray.X.vals[index], 1);
            }
        }
        harmless_update_network_gpu(anet);
        /* adversary trains on real vs generated images */
        data merge = concat_data(train, gray);
        //randomize_data(merge);
        float aloss = train_network(anet, merge);
        update_network_gpu(net);
#ifdef OPENCV
        if(display){
            image im = float_to_image(anet->w, anet->h, anet->c, gray.X.vals[0]);
            image im2 = float_to_image(anet->w, anet->h, anet->c, train.X.vals[0]);
            show_image(im, "gen", 1);
            show_image(im2, "train", 1);
        }
#endif
        free_data(merge);
        free_data(train);
        free_data(gray);
        if (aloss_avg < 0) aloss_avg = aloss;
        aloss_avg = aloss_avg*.9 + aloss*.1;
        gloss_avg = gloss_avg*.9 + gloss*.1;
        printf("%d: gen: %f, adv: %f | gen_avg: %f, adv_avg: %f, %f rate, %lf seconds, %d images\n", i, gloss, aloss, gloss_avg, aloss_avg, get_current_rate(net), sec(clock()-time), i*imgs);
        /* periodic checkpoints */
        if(i%1000==0){
            char buff[256];
            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
            save_weights(net, buff);
            sprintf(buff, "%s/%s_%d.weights", backup_directory, abase, i);
            save_weights(anet, buff);
        }
        if(i%100==0){
            char buff[256];
            sprintf(buff, "%s/%s.backup", backup_directory, base);
            save_weights(net, buff);
            sprintf(buff, "%s/%s.backup", backup_directory, abase);
            save_weights(anet, buff);
        }
    }
    char buff[256];
    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
    save_weights(net, buff);
#endif
}
/* DCGAN training loop (GPU-only build): generator `gnet` maps
 * unit-normalized Gaussian noise to images; adversary `anet` classifies
 * real (label 1) vs generated (label 0). Per sub-batch: sample noise,
 * run the generator, score the result with the adversary against an
 * all-ones truth, feed the adversary's input gradient back as the
 * generator's final-layer delta (the generator's own delta is zeroed),
 * harvest the fakes, then train the adversary on the real+fake mix.
 * Checkpoints every 1000/10000 batches; frees both networks on exit.
 * NOTE(review): imerror and the path list are never freed and the last
 * load_thread is not joined — confirm acceptable for this tool. */
void train_dcgan(char *cfg, char *weight, char *acfg, char *aweight, int clear, int display, char *train_images, int maxbatch)
{
#ifdef GPU
    char *backup_directory = "/home/kunle12/backup/";
    srand(time(0));
    char *base = basecfg(cfg);
    char *abase = basecfg(acfg);
    printf("%s\n", base);
    network *gnet = load_network(cfg, weight, clear);   /* generator */
    network *anet = load_network(acfg, aweight, clear); /* adversary */
    //float orig_rate = anet->learning_rate;
    int i, j, k;
    /* the generator's image-producing layer: first with 3 output channels */
    layer imlayer = {0};
    for (i = 0; i < gnet->n; ++i) {
        if (gnet->layers[i].out_c == 3) {
            imlayer = gnet->layers[i];
            break;
        }
    }
    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", gnet->learning_rate, gnet->momentum, gnet->decay);
    int imgs = gnet->batch*gnet->subdivisions;
    i = *gnet->seen/imgs;  /* resume the iteration counter */
    data train, buffer;
    list *plist = get_paths(train_images);
    //int N = plist->size;
    char **paths = (char **)list_to_array(plist);
    /* async loader configuration (sized for the adversary's input) */
    load_args args= get_base_args(anet);
    args.paths = paths;
    args.n = imgs;
    args.m = plist->size;
    args.d = &buffer;
    args.type = CLASSIFICATION_DATA;
    args.threads=16;
    args.classes = 1;
    char *ls[2] = {"imagenet", "zzzzzzzz"};
    args.labels = ls;
    pthread_t load_thread = load_data_in_thread(args);
    clock_t time;
    gnet->train = 1;
    anet->train = 1;
    int x_size = gnet->inputs*gnet->batch;   /* noise vector size per batch */
    int y_size = gnet->truths*gnet->batch;
    float *imerror = cuda_make_array(0, y_size);  /* adversary's image gradient */
    //int ay_size = anet->truths*anet->batch;
    float aloss_avg = -1;
    //data generated = copy_data(train);
    if (maxbatch == 0) maxbatch = gnet->max_batches;
    while (get_current_batch(gnet) < maxbatch) {
        i += 1;
        time=clock();
        pthread_join(load_thread, 0);
        train = buffer;
        //translate_data_rows(train, -.5);
        //scale_data_rows(train, 2);
        load_thread = load_data_in_thread(args);  /* prefetch next batch */
        printf("Loaded: %lf seconds\n", sec(clock()-time));
        data gen = copy_data(train);
        /* labels: real = 1, generated = 0 */
        for (j = 0; j < imgs; ++j) {
            train.y.vals[j][0] = 1;
            gen.y.vals[j][0] = 0;
        }
        time=clock();
        for(j = 0; j < gnet->subdivisions; ++j){
            get_next_batch(train, gnet->batch, j*gnet->batch, gnet->truth, 0);
            /* sample Gaussian noise and normalize each item to unit length */
            int z;
            for(z = 0; z < x_size; ++z){
                gnet->input[z] = rand_normal();
            }
            for(z = 0; z < gnet->batch; ++z){
                float mag = mag_array(gnet->input + z*gnet->inputs, gnet->inputs);
                scale_array(gnet->input + z*gnet->inputs, gnet->inputs, 1./mag);
            }
            /*
               for(z = 0; z < 100; ++z){
               printf("%f, ", gnet->input[z]);
               }
               printf("\n");
               printf("input: %f %f\n", mean_array(gnet->input, x_size), variance_array(gnet->input, x_size));
             */
            //cuda_push_array(gnet->input_gpu, gnet->input, x_size);
            //cuda_push_array(gnet->truth_gpu, gnet->truth, y_size);
            *gnet->seen += gnet->batch;
            forward_network(gnet);
            /* adversary scores the fake with target "real" (all ones);
               its input gradient lands in imerror */
            fill_gpu(imlayer.outputs*imlayer.batch, 0, imerror, 1);
            fill_cpu(anet->truths*anet->batch, 1, anet->truth, 1);
            copy_cpu(anet->inputs*anet->batch, imlayer.output, 1, anet->input, 1);
            anet->delta_gpu = imerror;
            forward_network(anet);
            backward_network(anet);
            //float genaloss = *anet->cost / anet->batch;
            //printf("%f\n", genaloss);
            /* generator trains purely on the adversarial gradient: its own
               final-layer delta is zeroed, then imerror is added */
            scal_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1);
            scal_gpu(imlayer.outputs*imlayer.batch, 0, gnet->layers[gnet->n-1].delta_gpu, 1);
            //printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs*imlayer.batch));
            //printf("features %f\n", cuda_mag_array(gnet->layers[gnet->n-1].delta_gpu, imlayer.outputs*imlayer.batch));
            axpy_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, gnet->layers[gnet->n-1].delta_gpu, 1);
            backward_network(gnet);
            /*
               for(k = 0; k < gnet->n; ++k){
               layer l = gnet->layers[k];
               cuda_pull_array(l.output_gpu, l.output, l.outputs*l.batch);
               printf("%d: %f %f\n", k, mean_array(l.output, l.outputs*l.batch), variance_array(l.output, l.outputs*l.batch));
               }
             */
            /* harvest the generated images as "fake" training data */
            for(k = 0; k < gnet->batch; ++k){
                int index = j*gnet->batch + k;
                copy_cpu(gnet->outputs, gnet->output + k*gnet->outputs, 1, gen.X.vals[index], 1);
            }
        }
        harmless_update_network_gpu(anet);
        /* adversary trains on the real + generated mix */
        data merge = concat_data(train, gen);
        //randomize_data(merge);
        float aloss = train_network(anet, merge);
        //translate_image(im, 1);
        //scale_image(im, .5);
        //translate_image(im2, 1);
        //scale_image(im2, .5);
#ifdef OPENCV
        if(display){
            image im = float_to_image(anet->w, anet->h, anet->c, gen.X.vals[0]);
            image im2 = float_to_image(anet->w, anet->h, anet->c, train.X.vals[0]);
            show_image(im, "gen", 1);
            show_image(im2, "train", 1);
            save_image(im, "gen");
            save_image(im2, "train");
        }
#endif
        /*
           if(aloss < .1){
           anet->learning_rate = 0;
           } else if (aloss > .3){
           anet->learning_rate = orig_rate;
           }
         */
        update_network_gpu(gnet);
        free_data(merge);
        free_data(train);
        free_data(gen);
        if (aloss_avg < 0) aloss_avg = aloss;
        aloss_avg = aloss_avg*.9 + aloss*.1;
        printf("%d: adv: %f | adv_avg: %f, %f rate, %lf seconds, %d images\n", i, aloss, aloss_avg, get_current_rate(gnet), sec(clock()-time), i*imgs);
        /* periodic checkpoints */
        if(i%10000==0){
            char buff[256];
            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
            save_weights(gnet, buff);
            sprintf(buff, "%s/%s_%d.weights", backup_directory, abase, i);
            save_weights(anet, buff);
        }
        if(i%1000==0){
            char buff[256];
            sprintf(buff, "%s/%s.backup", backup_directory, base);
            save_weights(gnet, buff);
            sprintf(buff, "%s/%s.backup", backup_directory, abase);
            save_weights(anet, buff);
        }
    }
    char buff[256];
    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
    save_weights(gnet, buff);
#endif
    free_network(gnet);
    free_network(anet);
}
/*
 * Adversarial "progressive" training: a generator network (gnet) is trained
 * against a discriminator/adversary network (anet) on images listed in
 * train_images.  Over the first maxbatch/2 batches an alpha value ramps
 * linearly from 0 to 1 and is handed to both nets via set_network_alpha_beta
 * (progressive-growing style blending, presumably — confirm against that
 * function's definition).  Checkpoints are written to backup_directory.
 *
 * cfg/weight:   generator config and (optional) weights.
 * acfg/aweight: adversary config and (optional) weights.
 * clear:        passed to load_network; resets *seen when nonzero.
 * display:      when built with OpenCV, show/save sample images each batch.
 * train_images: file listing training image paths.
 * maxbatch:     batch count to train for; 0 means use gnet->max_batches.
 *
 * NOTE(review): gnet/anet are declared inside #ifdef GPU but freed outside
 * it, so a non-GPU build of this function will not compile — confirm intended.
 * NOTE(review): base, abase, paths and plist are never freed (leaks once per
 * call); the final load_thread is never joined.
 */
void train_prog(char *cfg, char *weight, char *acfg, char *aweight, int clear, int display, char *train_images, int maxbatch)
{
#ifdef GPU
    char *backup_directory = "/home/kunle12/backup/";
    srand(time(0));
    char *base = basecfg(cfg);
    char *abase = basecfg(acfg);
    printf("%s\n", base);
    network *gnet = load_network(cfg, weight, clear);
    network *anet = load_network(acfg, aweight, clear);

    int i, j, k;
    layer imlayer = gnet->layers[gnet->n-1];   // generator output layer (the generated image)
    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", gnet->learning_rate, gnet->momentum, gnet->decay);
    int imgs = gnet->batch*gnet->subdivisions;
    i = *gnet->seen/imgs;
    data train, buffer;

    list *plist = get_paths(train_images);
    char **paths = (char **)list_to_array(plist);

    // Loader is configured for the adversary: real images, one class,
    // dummy label names (only y[0] is used below as the real/fake flag).
    load_args args= get_base_args(anet);
    args.paths = paths;
    args.n = imgs;
    args.m = plist->size;
    args.d = &buffer;
    args.type = CLASSIFICATION_DATA;
    args.threads=16;
    args.classes = 1;
    char *ls[2] = {"imagenet", "zzzzzzzz"};
    args.labels = ls;

    pthread_t load_thread = load_data_in_thread(args);
    clock_t time;

    gnet->train = 1;
    anet->train = 1;

    int x_size = gnet->inputs*gnet->batch;
    int y_size = gnet->truths*gnet->batch;
    // Scratch GPU buffer for the adversary's gradient w.r.t. the generated
    // image.  Allocated with y_size but consumed below with
    // imlayer.outputs*imlayer.batch — presumably equal since imlayer is the
    // final layer; TODO confirm.
    float *imerror = cuda_make_array(0, y_size);

    float aloss_avg = -1;

    if (maxbatch == 0) maxbatch = gnet->max_batches;
    while (get_current_batch(gnet) < maxbatch) {
        {
            // Ramp alpha 0 -> 1 over the first half of training, beta = 1 - alpha.
            int cb = get_current_batch(gnet);
            float alpha = (float) cb / (maxbatch/2);
            if(alpha > 1) alpha = 1;
            float beta = 1 - alpha;
            printf("%f %f\n", alpha, beta);
            set_network_alpha_beta(gnet, alpha, beta);
            set_network_alpha_beta(anet, beta, alpha);
        }
        i += 1;
        time=clock();
        pthread_join(load_thread, 0);
        train = buffer;
        load_thread = load_data_in_thread(args);   // prefetch next batch
        printf("Loaded: %lf seconds\n", sec(clock()-time));

        // gen will receive generator outputs; label real data 1, generated 0.
        data gen = copy_data(train);
        for (j = 0; j < imgs; ++j) {
            train.y.vals[j][0] = 1;
            gen.y.vals[j][0] = 0;
        }
        time=clock();

        for (j = 0; j < gnet->subdivisions; ++j) {
            get_next_batch(train, gnet->batch, j*gnet->batch, gnet->truth, 0);
            // Feed the generator pure noise.
            int z;
            for(z = 0; z < x_size; ++z){
                gnet->input[z] = rand_normal();
            }
            /*
               // (disabled) per-sample normalization of the noise vectors
               for(z = 0; z < gnet->batch; ++z){
                   float mag = mag_array(gnet->input + z*gnet->inputs, gnet->inputs);
                   scale_array(gnet->input + z*gnet->inputs, gnet->inputs, 1./mag);
               }
             */
            *gnet->seen += gnet->batch;
            forward_network(gnet);

            // Run the adversary on the generated image with "real" (=1) as the
            // truth, capturing its input gradient in imerror.
            fill_gpu(imlayer.outputs*imlayer.batch, 0, imerror, 1);
            fill_cpu(anet->truths*anet->batch, 1, anet->truth, 1);
            copy_cpu(anet->inputs*anet->batch, imlayer.output, 1, anet->input, 1);
            anet->delta_gpu = imerror;
            forward_network(anet);
            backward_network(anet);
            //float genaloss = *anet->cost / anet->batch;

            // Replace the generator's output delta with the adversary's
            // gradient (scal by 1 is a no-op kept for symmetry/tuning).
            scal_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1);
            scal_gpu(imlayer.outputs*imlayer.batch, 0, gnet->layers[gnet->n-1].delta_gpu, 1);
            axpy_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, gnet->layers[gnet->n-1].delta_gpu, 1);
            backward_network(gnet);

            // Stash the generated images as the "fake" half of the data.
            for(k = 0; k < gnet->batch; ++k){
                int index = j*gnet->batch + k;
                copy_cpu(gnet->outputs, gnet->output + k*gnet->outputs, 1, gen.X.vals[index], 1);
            }
        }
        // Discard gradients accumulated on anet during the generator pass.
        harmless_update_network_gpu(anet);

        // Train the adversary on real + generated images.
        data merge = concat_data(train, gen);
        float aloss = train_network(anet, merge);

#ifdef OPENCV
        if(display){
            image im = float_to_image(anet->w, anet->h, anet->c, gen.X.vals[0]);
            image im2 = float_to_image(anet->w, anet->h, anet->c, train.X.vals[0]);
            show_image(im, "gen", 1);
            show_image(im2, "train", 1);
            save_image(im, "gen");
            save_image(im2, "train");
        }
#endif
        update_network_gpu(gnet);
        free_data(merge);
        free_data(train);
        free_data(gen);
        if (aloss_avg < 0) aloss_avg = aloss;
        aloss_avg = aloss_avg*.9 + aloss*.1;   // running average of adversary loss
        printf("%d: adv: %f | adv_avg: %f, %f rate, %lf seconds, %d images\n", i, aloss, aloss_avg, get_current_rate(gnet), sec(clock()-time), i*imgs);
        if(i%10000==0){
            char buff[256];
            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
            save_weights(gnet, buff);
            sprintf(buff, "%s/%s_%d.weights", backup_directory, abase, i);
            save_weights(anet, buff);
        }
        if(i%1000==0){
            char buff[256];
            sprintf(buff, "%s/%s.backup", backup_directory, base);
            save_weights(gnet, buff);
            sprintf(buff, "%s/%s.backup", backup_directory, abase);
            save_weights(anet, buff);
        }
    }
    char buff[256];
    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
    save_weights(gnet, buff);
#endif
    free_network( gnet );
    free_network( anet );
}
void backward_activation_layer(layer l, network net) { gradient_array(l.output, l.outputs * l.batch, l.activation, l.delta); copy_cpu(l.outputs * l.batch, l.delta, 1, net.delta, 1); }
void train_char_rnn(char *cfgfile, char *weightfile, char *filename, int clear, int tokenized) { srand(time(0)); unsigned char *text = 0; int *tokens = 0; size_t size; if (tokenized) { tokens = read_tokenized_data(filename, &size); } else { text = read_file(filename); size = strlen((const char*) text); } char *backup_directory = "/home/pjreddie/backup/"; char *base = basecfg(cfgfile); fprintf(stderr, "%s\n", base); real_t avg_loss = -1; network *net = load_network(cfgfile, weightfile, clear); int inputs = net->inputs; fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g, Inputs: %d %d %d\n", net->learning_rate, net->momentum, net->decay, inputs, net->batch, net->time_steps); int batch = net->batch; int steps = net->time_steps; if (clear) *net->seen = 0; int i = (*net->seen) / net->batch; int streams = batch / steps; size_t *offsets = calloc(streams, sizeof(size_t)); int j; for (j = 0; j < streams; ++j) { offsets[j] = rand_size_t() % size; } clock_t time; while (get_current_batch(net) < net->max_batches) { i += 1; time = clock(); real_t_pair p; if (tokenized) { p = get_rnn_token_data(tokens, offsets, inputs, size, streams, steps); } else { p = get_rnn_data(text, offsets, inputs, size, streams, steps); } copy_cpu(net->inputs * net->batch, p.x, 1, net->input, 1); copy_cpu(net->truths * net->batch, p.y, 1, net->truth, 1); real_t loss = train_network_datum(net) / (batch); free(p.x); free(p.y); if (avg_loss < 0) avg_loss = loss; avg_loss = avg_loss * .9 + loss * .1; size_t chars = get_current_batch(net) * batch; fprintf(stderr, "%d: %f, %f avg, %f rate, %lf seconds, %f epochs\n", i, loss, avg_loss, get_current_rate(net), sec(clock() - time), (real_t) chars / size); for (j = 0; j < streams; ++j) { //printf("%d\n", j); if (rand() % 64 == 0) { //fprintf(stderr, "Reset\n"); offsets[j] = rand_size_t() % size; reset_network_state(net, j); } } if (i % 10000 == 0) { char buff[256]; sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i); save_weights(net, buff); } if 
(i % 100 == 0) { char buff[256]; sprintf(buff, "%s/%s.backup", backup_directory, base); save_weights(net, buff); } } char buff[256]; sprintf(buff, "%s/%s_final.weights", backup_directory, base); save_weights(net, buff); }
/*
 * CPU backward pass (backpropagation through time) for an LSTM layer.
 *
 * The layer owns eight inner connected layers: wf/wi/wg/wo act on the
 * recurrent state and uf/ui/ug/uo on the layer input (evidenced by the
 * s.input assignments below: prev_state_cpu for w*, state.input for u*).
 * All per-time-step pointers are advanced to the last step and the loop
 * walks time backwards, carrying dL/dc in l.dc_cpu between iterations.
 */
void backward_lstm_layer(layer l, network state)
{
    network s = { 0 };
    s.train = state.train;
    int i;
    // Local copies of the inner layers so their output/delta pointers can be
    // stepped through time without touching the originals.
    layer wf = *(l.wf);
    layer wi = *(l.wi);
    layer wg = *(l.wg);
    layer wo = *(l.wo);
    layer uf = *(l.uf);
    layer ui = *(l.ui);
    layer ug = *(l.ug);
    layer uo = *(l.uo);

    // Jump to the final time step.
    increment_layer(&wf, l.steps - 1);
    increment_layer(&wi, l.steps - 1);
    increment_layer(&wg, l.steps - 1);
    increment_layer(&wo, l.steps - 1);
    increment_layer(&uf, l.steps - 1);
    increment_layer(&ui, l.steps - 1);
    increment_layer(&ug, l.steps - 1);
    increment_layer(&uo, l.steps - 1);

    state.input += l.inputs * l.batch * (l.steps - 1);
    if (state.delta) state.delta += l.inputs * l.batch * (l.steps - 1);

    l.output += l.outputs * l.batch * (l.steps - 1);
    l.cell_cpu += l.outputs * l.batch * (l.steps - 1);
    l.delta += l.outputs * l.batch * (l.steps - 1);

    for (i = l.steps - 1; i >= 0; --i) {
        // Stage c_t, h_t and (for t > 0) c_{t-1}, h_{t-1}.  At t == 0 the
        // prev_* buffers keep their prior contents — initial-state handling;
        // TODO(review) confirm they hold the intended initial state then.
        if (i != 0) copy_cpu(l.outputs * l.batch, l.cell_cpu - l.outputs * l.batch, 1, l.prev_cell_cpu, 1);
        copy_cpu(l.outputs * l.batch, l.cell_cpu, 1, l.c_cpu, 1);
        if (i != 0) copy_cpu(l.outputs * l.batch, l.output - l.outputs * l.batch, 1, l.prev_state_cpu, 1);
        copy_cpu(l.outputs * l.batch, l.output, 1, l.h_cpu, 1);

        // Delta slot of the previous step's hidden state (NULL at t == 0);
        // the w* backward passes accumulate into it via s.delta.
        l.dh_cpu = (i == 0) ? 0 : l.delta - l.outputs * l.batch;

        // Rebuild gate pre-activations: gate = w*(h_{t-1}) + u*(x_t).
        copy_cpu(l.outputs * l.batch, wf.output, 1, l.f_cpu, 1);
        axpy_cpu(l.outputs * l.batch, 1, uf.output, 1, l.f_cpu, 1);
        copy_cpu(l.outputs * l.batch, wi.output, 1, l.i_cpu, 1);
        axpy_cpu(l.outputs * l.batch, 1, ui.output, 1, l.i_cpu, 1);
        copy_cpu(l.outputs * l.batch, wg.output, 1, l.g_cpu, 1);
        axpy_cpu(l.outputs * l.batch, 1, ug.output, 1, l.g_cpu, 1);
        copy_cpu(l.outputs * l.batch, wo.output, 1, l.o_cpu, 1);
        axpy_cpu(l.outputs * l.batch, 1, uo.output, 1, l.o_cpu, 1);

        // Apply the gate nonlinearities: f, i, o sigmoid; g tanh.
        activate_array(l.f_cpu, l.outputs * l.batch, LOGISTIC);
        activate_array(l.i_cpu, l.outputs * l.batch, LOGISTIC);
        activate_array(l.g_cpu, l.outputs * l.batch, TANH);
        activate_array(l.o_cpu, l.outputs * l.batch, LOGISTIC);

        // temp3 = dL/dh_t (this step's incoming delta).
        copy_cpu(l.outputs * l.batch, l.delta, 1, l.temp3_cpu, 1);

        // temp = tanh(c_t).
        copy_cpu(l.outputs * l.batch, l.c_cpu, 1, l.temp_cpu, 1);
        activate_array(l.temp_cpu, l.outputs * l.batch, TANH);

        // temp2 = dL/dc_t = dL/dh_t * o * tanh'(c_t) + dc carried from t+1.
        copy_cpu(l.outputs * l.batch, l.temp3_cpu, 1, l.temp2_cpu, 1);
        mul_cpu(l.outputs * l.batch, l.o_cpu, 1, l.temp2_cpu, 1);
        gradient_array(l.temp_cpu, l.outputs * l.batch, TANH, l.temp2_cpu);
        axpy_cpu(l.outputs * l.batch, 1, l.dc_cpu, 1, l.temp2_cpu, 1);

        // Output gate delta: temp = dL/dh_t * tanh(c_t) * sigmoid'(o).
        copy_cpu(l.outputs * l.batch, l.c_cpu, 1, l.temp_cpu, 1);
        activate_array(l.temp_cpu, l.outputs * l.batch, TANH);
        mul_cpu(l.outputs * l.batch, l.temp3_cpu, 1, l.temp_cpu, 1);
        gradient_array(l.o_cpu, l.outputs * l.batch, LOGISTIC, l.temp_cpu);
        copy_cpu(l.outputs * l.batch, l.temp_cpu, 1, wo.delta, 1);
        s.input = l.prev_state_cpu;
        s.delta = l.dh_cpu;
        backward_connected_layer(wo, s);
        copy_cpu(l.outputs * l.batch, l.temp_cpu, 1, uo.delta, 1);
        s.input = state.input;
        s.delta = state.delta;
        backward_connected_layer(uo, s);

        // Candidate gate delta: temp = dL/dc_t * i * tanh'(g).
        copy_cpu(l.outputs * l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);
        mul_cpu(l.outputs * l.batch, l.i_cpu, 1, l.temp_cpu, 1);
        gradient_array(l.g_cpu, l.outputs * l.batch, TANH, l.temp_cpu);
        copy_cpu(l.outputs * l.batch, l.temp_cpu, 1, wg.delta, 1);
        s.input = l.prev_state_cpu;
        s.delta = l.dh_cpu;
        backward_connected_layer(wg, s);
        copy_cpu(l.outputs * l.batch, l.temp_cpu, 1, ug.delta, 1);
        s.input = state.input;
        s.delta = state.delta;
        backward_connected_layer(ug, s);

        // Input gate delta: temp = dL/dc_t * g * sigmoid'(i).
        copy_cpu(l.outputs * l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);
        mul_cpu(l.outputs * l.batch, l.g_cpu, 1, l.temp_cpu, 1);
        gradient_array(l.i_cpu, l.outputs * l.batch, LOGISTIC, l.temp_cpu);
        copy_cpu(l.outputs * l.batch, l.temp_cpu, 1, wi.delta, 1);
        s.input = l.prev_state_cpu;
        s.delta = l.dh_cpu;
        backward_connected_layer(wi, s);
        copy_cpu(l.outputs * l.batch, l.temp_cpu, 1, ui.delta, 1);
        s.input = state.input;
        s.delta = state.delta;
        backward_connected_layer(ui, s);

        // Forget gate delta: temp = dL/dc_t * c_{t-1} * sigmoid'(f).
        copy_cpu(l.outputs * l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);
        mul_cpu(l.outputs * l.batch, l.prev_cell_cpu, 1, l.temp_cpu, 1);
        gradient_array(l.f_cpu, l.outputs * l.batch, LOGISTIC, l.temp_cpu);
        copy_cpu(l.outputs * l.batch, l.temp_cpu, 1, wf.delta, 1);
        s.input = l.prev_state_cpu;
        s.delta = l.dh_cpu;
        backward_connected_layer(wf, s);
        copy_cpu(l.outputs * l.batch, l.temp_cpu, 1, uf.delta, 1);
        s.input = state.input;
        s.delta = state.delta;
        backward_connected_layer(uf, s);

        // Carry dL/dc_{t-1} = dL/dc_t * f to the next (earlier) step.
        copy_cpu(l.outputs * l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);
        mul_cpu(l.outputs * l.batch, l.f_cpu, 1, l.temp_cpu, 1);
        copy_cpu(l.outputs * l.batch, l.temp_cpu, 1, l.dc_cpu, 1);

        // Step every per-time-step pointer back by one.
        state.input -= l.inputs * l.batch;
        if (state.delta) state.delta -= l.inputs * l.batch;
        l.output -= l.outputs * l.batch;
        l.cell_cpu -= l.outputs * l.batch;
        l.delta -= l.outputs * l.batch;

        increment_layer(&wf, -1);
        increment_layer(&wi, -1);
        increment_layer(&wg, -1);
        increment_layer(&wo, -1);
        increment_layer(&uf, -1);
        increment_layer(&ui, -1);
        increment_layer(&ug, -1);
        increment_layer(&uo, -1);
    }
}
void backward_cost_layer(const cost_layer l, network_state state) { copy_cpu(l.batch*l.inputs, l.delta, 1, state.delta, 1); }
void forward_shortcut_layer(const layer l, network net) { copy_cpu(l.outputs*l.batch, net.input, 1, l.output, 1); shortcut_cpu(l.batch, l.w, l.h, l.c, net.layers[l.index].output, l.out_w, l.out_h, l.out_c, l.output); activate_array(l.output, l.outputs*l.batch, l.activation); }
void test_go(char *cfg, char *weights, int multi) { network net = parse_network_cfg(cfg); if(weights){ load_weights(&net, weights); } srand(time(0)); set_batch_network(&net, 1); float *board = calloc(19*19, sizeof(float)); float *move = calloc(19*19, sizeof(float)); int color = 1; while(1){ float *output = network_predict(net, board); copy_cpu(19*19, output, 1, move, 1); int i; if(multi){ image bim = float_to_image(19, 19, 1, board); for(i = 1; i < 8; ++i){ rotate_image_cw(bim, i); if(i >= 4) flip_image(bim); float *output = network_predict(net, board); image oim = float_to_image(19, 19, 1, output); if(i >= 4) flip_image(oim); rotate_image_cw(oim, -i); axpy_cpu(19*19, 1, output, 1, move, 1); if(i >= 4) flip_image(bim); rotate_image_cw(bim, -i); } scal_cpu(19*19, 1./8., move, 1); } for(i = 0; i < 19*19; ++i){ if(board[i]) move[i] = 0; } int indexes[nind]; int row, col; top_k(move, 19*19, nind, indexes); print_board(board, color, indexes); for(i = 0; i < nind; ++i){ int index = indexes[i]; row = index / 19; col = index % 19; printf("%d: %c %d, %.2f%%\n", i+1, col + 'A' + 1*(col > 7 && noi), (inverted)?19 - row : row+1, move[index]*100); } //if(color == 1) printf("\u25EF Enter move: "); //else printf("\u25C9 Enter move: "); if(color == 1) printf("X Enter move: "); else printf("O Enter move: "); char c; char *line = fgetl(stdin); int picked = 1; int dnum = sscanf(line, "%d", &picked); int cnum = sscanf(line, "%c", &c); if (strlen(line) == 0 || dnum) { --picked; if (picked < nind){ int index = indexes[picked]; row = index / 19; col = index % 19; board[row*19 + col] = 1; } } else if (cnum){ if (c <= 'T' && c >= 'A'){ int num = sscanf(line, "%c %d", &c, &row); row = (inverted)?19 - row : row-1; col = c - 'A'; if (col > 7 && noi) col -= 1; if (num == 2) board[row*19 + col] = 1; } else if (c == 'p') { // Pass } else if(c=='b' || c == 'w'){ char g; int num = sscanf(line, "%c %c %d", &g, &c, &row); row = (inverted)?19 - row : row-1; col = c - 'A'; if (col > 7 && noi) col -= 
1; if (num == 3) board[row*19 + col] = (g == 'b') ? color : -color; } else if(c == 'c'){ char g; int num = sscanf(line, "%c %c %d", &g, &c, &row); row = (inverted)?19 - row : row-1; col = c - 'A'; if (col > 7 && noi) col -= 1; if (num == 3) board[row*19 + col] = 0; } } free(line); flip_board(board); color = -color; } }