void forward_compact_layer_gpu(const layer l, network_state state)
{
    int i, b;
    for (b = 0; b < l.batch; b++) {
        if (l.method == 0) { // add
            // copy first section
            copy_ongpu(l.outputs, state.input + b*l.inputs, 1, l.output_gpu + b*l.outputs, 1);
            // add other splits
            for (i = 1; i < l.index; i++) {
                axpy_ongpu(l.outputs, 1, state.input + b*l.inputs + i*l.outputs, 1, l.output_gpu + b*l.outputs, 1);
            }
        } else if (l.method == 1) { // sub
            // copy first section
            copy_ongpu(l.outputs, state.input + b*l.inputs, 1, l.output_gpu + b*l.outputs, 1);
            // sub other splits
            for (i = 1; i < l.index; i++) {
                axpy_ongpu(l.outputs, -1, state.input + b*l.inputs + i*l.outputs, 1, l.output_gpu + b*l.outputs, 1);
            }
        } else if (l.method == 2) { // max
            compact_forward_max_gpu(l.w, l.h, l.c, l.index, state.input + b*l.inputs, l.output_gpu + b*l.outputs, l.indexes_gpu);
        } else if (l.method == 10) {
            compact_forward_padd_gpu(l.w, l.h, l.c, state.input + b*l.inputs, l.output_gpu + b*l.outputs);
        } else if (l.method == 12) {
            compact_forward_pmax_gpu(l.w, l.h, l.c, state.input + b*l.inputs, l.output_gpu + b*l.outputs, l.indexes_gpu);
        }
    }
    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
}
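/*
 * Illustrative CPU reference for the "add" path above (l.method == 0), assuming the
 * input for one batch element holds l.index contiguous splits of l.outputs floats each.
 * This is a sketch for clarity only, not the kernel the GPU path calls.
 */
static void compact_add_cpu_sketch(const float *input, float *output, int outputs, int index)
{
    int i, j;
    for (j = 0; j < outputs; ++j) output[j] = input[j];            // copy first split
    for (i = 1; i < index; ++i) {                                   // accumulate the remaining splits
        for (j = 0; j < outputs; ++j) output[j] += input[i*outputs + j];
    }
}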
void backward_crnn_layer_gpu(layer_t l, network_state state)
{
    NETWORK_STATE(s);
    s.train = state.train;
    int i;
    layer_t input_layer = *(l.input_layer);
    layer_t self_layer = *(l.self_layer);
    layer_t output_layer = *(l.output_layer);
    increment_layer(&input_layer, l.steps - 1);
    increment_layer(&self_layer, l.steps - 1);
    increment_layer(&output_layer, l.steps - 1);
    l.state_gpu += l.hidden*l.batch*l.steps;
    for (i = l.steps-1; i >= 0; --i) {
        copy_ongpu(l.hidden * l.batch, input_layer.output_gpu, 1, l.state_gpu, 1);
        axpy_ongpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);

        s.input = l.state_gpu;
        s.delta = self_layer.delta_gpu;
        backward_convolutional_layer_gpu(output_layer, s);

        l.state_gpu -= l.hidden*l.batch;

        s.input = l.state_gpu;
        s.delta = self_layer.delta_gpu - l.hidden*l.batch;
        if (i == 0) s.delta = NULL;
        backward_convolutional_layer_gpu(self_layer, s);

        copy_ongpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);
        if (i > 0 && l.shortcut) axpy_ongpu(l.hidden*l.batch, 1, self_layer.delta_gpu, 1, self_layer.delta_gpu - l.hidden*l.batch, 1);

        s.input = state.input + i*l.inputs*l.batch;
        if (state.delta) s.delta = state.delta + i*l.inputs*l.batch;
        else s.delta = NULL;
        backward_convolutional_layer_gpu(input_layer, s);

        increment_layer(&input_layer, -1);
        increment_layer(&self_layer, -1);
        increment_layer(&output_layer, -1);
    }
}
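/*
 * The recurrent layers in this file walk their time steps by shifting per-step buffers
 * with increment_layer. A minimal sketch of what that helper is assumed to do (advance
 * the GPU output and delta pointers by `steps` batches of activations); the real helper
 * in this codebase may shift additional buffers such as x_gpu / x_norm_gpu as well.
 */
static void increment_layer_sketch(layer *l, int steps)
{
    int num = l->outputs * l->batch * steps;
    l->output_gpu += num;
    l->delta_gpu += num;
}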
void backward_route_layer_gpu(const route_layer l, network_state state)
{
    int i, j;
    int offset = 0;
    for (i = 0; i < l.n; ++i) {
        int index = l.input_layers[i];
        float *delta = state.net.layers[index].delta_gpu;
        int input_size = l.input_sizes[i];
        for (j = 0; j < l.batch; ++j) {
            copy_ongpu(input_size, l.delta_gpu + offset + j*l.outputs, 1, delta + j*input_size, 1);
        }
        offset += input_size;
    }
}
void forward_route_layer_gpu(const route_layer l, network_state state)
{
    int i, j;
    int offset = 0;
    for (i = 0; i < l.n; ++i) {
        int index = l.input_layers[i];
        float *input = state.net.layers[index].output_gpu;
        int input_size = l.input_sizes[i];
        for (j = 0; j < l.batch; ++j) {
            copy_ongpu(input_size, input + j*input_size, 1, l.output_gpu + offset + j*l.outputs, 1);
        }
        offset += input_size;
    }
}
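/*
 * What the two route functions above implement, reduced to a CPU sketch (an
 * illustration, not the code used above): for each batch element the route layer's
 * output is the concatenation of the selected source layers' outputs, which is why the
 * copies stride by l.outputs on the route side and by input_size on each source.
 */
static void route_concat_cpu_sketch(float **inputs, const int *input_sizes, int n,
                                    int batch, float *output, int outputs)
{
    int i, j, k;
    for (j = 0; j < batch; ++j) {
        int offset = 0;
        for (i = 0; i < n; ++i) {
            for (k = 0; k < input_sizes[i]; ++k) {
                output[j*outputs + offset + k] = inputs[i][j*input_sizes[i] + k];
            }
            offset += input_sizes[i];
        }
    }
}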
void backward_batchnorm_layer_gpu(const layer l, network_state state)
{
#ifdef CUDNN
    float one = 1;
    float zero = 0;
    cudnnBatchNormalizationBackward(cudnn_handle(),
            CUDNN_BATCHNORM_SPATIAL,
            &one,
            &zero,
            &one,
            &one,
            l.dstTensorDesc,
            l.x_gpu,
            l.dstTensorDesc,
            l.delta_gpu,
            l.dstTensorDesc,
            l.x_norm_gpu,
            l.normTensorDesc,
            l.scales_gpu,
            l.scale_updates_gpu,
            l.bias_updates_gpu,
            .00001,
            l.mean_gpu,
            l.variance_gpu);
    copy_ongpu(l.outputs*l.batch, l.x_norm_gpu, 1, l.delta_gpu, 1);
#else
    backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h);
    backward_scale_gpu(l.x_norm_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates_gpu);

    scale_bias_gpu(l.delta_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);

    fast_mean_delta_gpu(l.delta_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.mean_delta_gpu);
    fast_variance_delta_gpu(l.x_gpu, l.delta_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta_gpu);
    normalize_delta_gpu(l.x_gpu, l.mean_gpu, l.variance_gpu, l.mean_delta_gpu, l.variance_delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.delta_gpu);
#endif
    if (l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, state.delta, 1);
}
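/*
 * The non-CUDNN branch above composes the standard spatial batch-norm backward pass out
 * of separate kernels. A single-channel CPU sketch of that math (illustration only,
 * assuming <math.h>; m = batch*out_w*out_h values in the channel, eps mirrors the
 * .00001 used above, and the small contribution of d(variance) to d(mean) is ignored):
 */
static void batchnorm_backward_channel_sketch(const float *x, float *delta,
                                              float mean, float variance, float scale,
                                              int m, float eps)
{
    int i;
    float dvar = 0, dmean = 0;
    for (i = 0; i < m; ++i) {
        dvar  += delta[i] * scale * (x[i] - mean);   // gradient w.r.t. the channel variance (pre-factor applied below)
        dmean += delta[i] * scale;                   // gradient w.r.t. the channel mean (pre-factor applied below)
    }
    dvar  *= -0.5f * powf(variance + eps, -1.5f);
    dmean *= -1.0f / sqrtf(variance + eps);
    for (i = 0; i < m; ++i) {                        // gradient w.r.t. the inputs, written back into delta
        delta[i] = delta[i] * scale / sqrtf(variance + eps)
                 + dvar * 2.0f * (x[i] - mean) / m
                 + dmean / m;
    }
}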
void forward_cost_layer_gpu(cost_layer l, network_state state)
{
    if (!state.truth) return;
    if (l.cost_type == MASKED) {
        mask_ongpu(l.batch*l.inputs, state.input, SECRET_NUM, state.truth);
    }

    if (l.cost_type == SMOOTH) {
        smooth_l1_gpu(l.batch*l.inputs, state.input, state.truth, l.delta_gpu);
    } else {
        copy_ongpu(l.batch*l.inputs, state.truth, 1, l.delta_gpu, 1);
        axpy_ongpu(l.batch*l.inputs, -1, state.input, 1, l.delta_gpu, 1);
    }

    cuda_pull_array(l.delta_gpu, l.delta, l.batch*l.inputs);
    *(l.output) = dot_cpu(l.batch*l.inputs, l.delta, 1, l.delta, 1);
}
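/*
 * For the default (sum-of-squares) branch above, the delta pushed into l.delta_gpu is
 * simply truth - prediction and the scalar written to *l.output is the squared error.
 * A plain-C sketch of that reduction over n values (illustration, not the GPU path):
 */
static float sse_cost_cpu_sketch(const float *pred, const float *truth, float *delta, int n)
{
    int i;
    float cost = 0;
    for (i = 0; i < n; ++i) {
        delta[i] = truth[i] - pred[i];   // same sign convention as the copy/axpy pair above
        cost += delta[i] * delta[i];
    }
    return cost;
}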
void forward_rnn_layer_gpu(layer l, network_state state)
{
    network_state s = {0};
    s.train = state.train;
    int i;
    layer input_layer = *(l.input_layer);
    layer self_layer = *(l.self_layer);
    layer output_layer = *(l.output_layer);

    fill_ongpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1);
    fill_ongpu(l.hidden * l.batch * l.steps, 0, self_layer.delta_gpu, 1);
    fill_ongpu(l.hidden * l.batch * l.steps, 0, input_layer.delta_gpu, 1);
    if (state.train) fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1);

    for (i = 0; i < l.steps; ++i) {
        s.input = state.input;
        forward_connected_layer_gpu(input_layer, s);

        s.input = l.state_gpu;
        forward_connected_layer_gpu(self_layer, s);

        float *old_state = l.state_gpu;
        if (state.train) l.state_gpu += l.hidden*l.batch;
        if (l.shortcut) {
            copy_ongpu(l.hidden * l.batch, old_state, 1, l.state_gpu, 1);
        } else {
            fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1);
        }
        axpy_ongpu(l.hidden * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1);
        axpy_ongpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);

        s.input = l.state_gpu;
        forward_connected_layer_gpu(output_layer, s);

        state.input += l.inputs*l.batch;
        increment_layer(&input_layer, 1);
        increment_layer(&self_layer, 1);
        increment_layer(&output_layer, 1);
    }
}
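/*
 * One recurrent step, as wired above, reduced to plain C (a sketch, not the darknet
 * implementation): the new hidden state is the elementwise sum of the input-layer and
 * self-layer activations, stacked on top of the previous state when l.shortcut is set;
 * the output layer then reads that state.
 */
static void rnn_combine_state_sketch(const float *prev_state, const float *input_out,
                                     const float *self_out, float *new_state,
                                     int n, int shortcut)
{
    int i;
    for (i = 0; i < n; ++i) {
        new_state[i] = (shortcut ? prev_state[i] : 0.f) + input_out[i] + self_out[i];
    }
}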
void backward_rnn_layer_gpu(layer l, network_state state)
{
    network_state s = {0};
    s.train = state.train;
    int i;
    layer input_layer = *(l.input_layer);
    layer self_layer = *(l.self_layer);
    layer output_layer = *(l.output_layer);
    increment_layer(&input_layer, l.steps - 1);
    increment_layer(&self_layer, l.steps - 1);
    increment_layer(&output_layer, l.steps - 1);
    l.state_gpu += l.hidden*l.batch*l.steps;
    for (i = l.steps-1; i >= 0; --i) {
        s.input = l.state_gpu;
        s.delta = self_layer.delta_gpu;
        backward_connected_layer_gpu(output_layer, s);

        l.state_gpu -= l.hidden*l.batch;

        copy_ongpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);

        s.input = l.state_gpu;
        s.delta = self_layer.delta_gpu - l.hidden*l.batch;
        if (i == 0) s.delta = 0;
        backward_connected_layer_gpu(self_layer, s);

        //copy_ongpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);
        if (i > 0 && l.shortcut) axpy_ongpu(l.hidden*l.batch, 1, self_layer.delta_gpu, 1, self_layer.delta_gpu - l.hidden*l.batch, 1);

        s.input = state.input + i*l.inputs*l.batch;
        if (state.delta) s.delta = state.delta + i*l.inputs*l.batch;
        else s.delta = 0;
        backward_connected_layer_gpu(input_layer, s);

        increment_layer(&input_layer, -1);
        increment_layer(&self_layer, -1);
        increment_layer(&output_layer, -1);
    }
}
void optimize_picture(network *net, image orig, int max_layer, float scale, float rate, float thresh, int norm)
{
    //scale_image(orig, 2);
    //translate_image(orig, -1);
    net->n = max_layer + 1;

    int dx = rand()%16 - 8;
    int dy = rand()%16 - 8;
    int flip = rand()%2;

    image crop = crop_image(orig, dx, dy, orig.w, orig.h);
    image im = resize_image(crop, (int)(orig.w * scale), (int)(orig.h * scale));
    if (flip) flip_image(im);

    resize_network(net, im.w, im.h);
    layer_t last = net->layers[net->n-1];
    //net->layers[net->n - 1].activation = LINEAR;

    image delta = make_image(im.w, im.h, im.c);

    NETWORK_STATE(state);

#ifdef GPU
    state.input = cuda_make_array(im.data, im.w*im.h*im.c);
    state.delta = cuda_make_array(im.data, im.w*im.h*im.c);

    forward_network_gpu(*net, state);
    copy_ongpu(last.outputs, last.output_gpu, 1, last.delta_gpu, 1);

    cuda_pull_array(last.delta_gpu, last.delta, last.outputs);
    calculate_loss(last.delta, last.delta, last.outputs, thresh);
    cuda_push_array(last.delta_gpu, last.delta, last.outputs);

    backward_network_gpu(*net, state);

    cuda_pull_array(state.delta, delta.data, im.w*im.h*im.c);

    cuda_free(state.input);
    cuda_free(state.delta);
#else
    state.input = im.data;
    state.delta = delta.data;
    forward_network(*net, state);
    fltcpy(last.delta, last.output, last.outputs);
    calculate_loss(last.output, last.delta, last.outputs, thresh);
    backward_network(*net, state);
#endif

    if (flip) flip_image(delta);
    //normalize_array(delta.data, delta.w*delta.h*delta.c);
    image resized = resize_image(delta, orig.w, orig.h);
    image out = crop_image(resized, -dx, -dy, orig.w, orig.h);

    /*
    image g = grayscale_image(out);
    free_image(out);
    out = g;
    */

    //rate = rate / abs_mean(out.data, out.w*out.h*out.c);
    if (norm) normalize_array(out.data, out.w*out.h*out.c);
    fltaddmul(orig.data, out.data, orig.w * orig.h * orig.c, rate);

    /*
    normalize_array(orig.data, orig.w*orig.h*orig.c);
    scale_image(orig, sqrt(var));
    translate_image(orig, mean);
    */

    //translate_image(orig, 1);
    //scale_image(orig, .5);
    //normalize_image(orig);
    constrain_image(orig);

    free_image(crop);
    free_image(im);
    free_image(delta);
    free_image(resized);
    free_image(out);
}
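/*
 * The CPU fallback in optimize_picture relies on two small helpers, fltcpy and
 * fltaddmul, defined elsewhere in this codebase. The sketches below show only the
 * behaviour the call sites appear to assume (copy n floats; accumulate a scaled array)
 * and are assumptions, not the actual definitions.
 */
static void fltcpy_sketch(float *dst, const float *src, int n)
{
    int i;
    for (i = 0; i < n; ++i) dst[i] = src[i];
}

static void fltaddmul_sketch(float *dst, const float *src, int n, float scale)
{
    int i;
    for (i = 0; i < n; ++i) dst[i] += src[i] * scale;
}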
void backward_cost_layer_gpu(const cost_layer l, network_state state)
{
    copy_ongpu(l.batch*l.inputs, l.delta_gpu, 1, state.delta, 1);
}
void forward_shortcut_layer_gpu(const layer l, network_state state)
{
    copy_ongpu(l.outputs*l.batch, state.input, 1, l.output_gpu, 1);
    shortcut_gpu(l.batch, l.w, l.h, l.c, state.net.layers[l.index].output_gpu, l.out_w, l.out_h, l.out_c, l.output_gpu);
    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
}
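/*
 * shortcut_gpu adds an earlier layer's activations into this layer's output. When the
 * two layers share w, h and c this reduces to an elementwise residual add, sketched on
 * the CPU below; mismatched shapes involve the striding/subsampling that this
 * illustration leaves out.
 */
static void shortcut_same_size_cpu_sketch(const float *from_output, float *output, int n)
{
    int i;
    for (i = 0; i < n; ++i) output[i] += from_output[i];
}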
void train_lsd3(char *fcfg, char *fweight, char *gcfg, char *gweight, char *acfg, char *aweight, int clear)
{
#ifdef GPU
    //char *train_images = "/home/pjreddie/data/coco/trainvalno5k.txt";
    char *train_images = "/home/pjreddie/data/imagenet/imagenet1k.train.list";
    //char *style_images = "/home/pjreddie/data/coco/trainvalno5k.txt";
    char *style_images = "/home/pjreddie/zelda.txt";
    char *backup_directory = "/home/pjreddie/backup/";
    srand(time(0));

    network fnet = load_network(fcfg, fweight, clear);
    network gnet = load_network(gcfg, gweight, clear);
    network anet = load_network(acfg, aweight, clear);
    char *gbase = basecfg(gcfg);
    char *abase = basecfg(acfg);

    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", gnet.learning_rate, gnet.momentum, gnet.decay);
    int imgs = gnet.batch*gnet.subdivisions;
    int i = *gnet.seen/imgs;

    data train, tbuffer;
    data style, sbuffer;

    list *slist = get_paths(style_images);
    char **spaths = (char **)list_to_array(slist);
    list *tlist = get_paths(train_images);
    char **tpaths = (char **)list_to_array(tlist);

    load_args targs = get_base_args(gnet);
    targs.paths = tpaths;
    targs.n = imgs;
    targs.m = tlist->size;
    targs.d = &tbuffer;
    targs.type = CLASSIFICATION_DATA;
    targs.classes = 1;
    char *ls[1] = {"zelda"};
    targs.labels = ls;

    load_args sargs = get_base_args(gnet);
    sargs.paths = spaths;
    sargs.n = imgs;
    sargs.m = slist->size;
    sargs.d = &sbuffer;
    sargs.type = CLASSIFICATION_DATA;
    sargs.classes = 1;
    sargs.labels = ls;

    pthread_t tload_thread = load_data_in_thread(targs);
    pthread_t sload_thread = load_data_in_thread(sargs);

    clock_t time;
    float aloss_avg = -1;
    float floss_avg = -1;

    network_state fstate = {0};
    fstate.index = 0;
    fstate.net = fnet;
    int x_size = get_network_input_size(fnet)*fnet.batch;
    int y_size = get_network_output_size(fnet)*fnet.batch;
    fstate.input = cuda_make_array(0, x_size);
    fstate.truth = cuda_make_array(0, y_size);
    fstate.delta = cuda_make_array(0, x_size);
    fstate.train = 1;
    float *X = (float*)calloc(x_size, sizeof(float));
    float *y = (float*)calloc(y_size, sizeof(float));

    float *ones = cuda_make_array(0, anet.batch);
    float *zeros = cuda_make_array(0, anet.batch);
    fill_ongpu(anet.batch, .99, ones, 1);
    fill_ongpu(anet.batch, .01, zeros, 1);

    network_state astate = {0};
    astate.index = 0;
    astate.net = anet;
    int ax_size = get_network_input_size(anet)*anet.batch;
    int ay_size = get_network_output_size(anet)*anet.batch;
    astate.input = 0;
    astate.truth = ones;
    astate.delta = cuda_make_array(0, ax_size);
    astate.train = 1;

    network_state gstate = {0};
    gstate.index = 0;
    gstate.net = gnet;
    int gx_size = get_network_input_size(gnet)*gnet.batch;
    int gy_size = get_network_output_size(gnet)*gnet.batch;
    gstate.input = cuda_make_array(0, gx_size);
    gstate.truth = 0;
    gstate.delta = 0;
    gstate.train = 1;

    while (get_current_batch(gnet) < gnet.max_batches) {
        i += 1;
        time = clock();
        pthread_join(tload_thread, 0);
        pthread_join(sload_thread, 0);
        train = tbuffer;
        style = sbuffer;
        tload_thread = load_data_in_thread(targs);
        sload_thread = load_data_in_thread(sargs);

        printf("Loaded: %lf seconds\n", sec(clock()-time));

        data generated = copy_data(train);
        time = clock();

        int j, k;
        float floss = 0;
        for (j = 0; j < fnet.subdivisions; ++j) {
            layer imlayer = gnet.layers[gnet.n - 1];
            get_next_batch(train, fnet.batch, j*fnet.batch, X, y);

            cuda_push_array(fstate.input, X, x_size);
            cuda_push_array(gstate.input, X, gx_size);
            *gnet.seen += gnet.batch;

            forward_network_gpu(fnet, fstate);
            float *feats = fnet.layers[fnet.n - 2].output_gpu;
            copy_ongpu(y_size, feats, 1, fstate.truth, 1);

            forward_network_gpu(gnet, gstate);
            float *gen = gnet.layers[gnet.n-1].output_gpu;
            copy_ongpu(x_size, gen, 1, fstate.input, 1);

            fill_ongpu(x_size, 0, fstate.delta, 1);
            forward_network_gpu(fnet, fstate);
            backward_network_gpu(fnet, fstate);
            //HERE

            astate.input = gen;
            fill_ongpu(ax_size, 0, astate.delta, 1);
            forward_network_gpu(anet, astate);
            backward_network_gpu(anet, astate);

            float *delta = imlayer.delta_gpu;
            fill_ongpu(x_size, 0, delta, 1);
            scal_ongpu(x_size, 100, astate.delta, 1);
            scal_ongpu(x_size, .00001, fstate.delta, 1);
            axpy_ongpu(x_size, 1, fstate.delta, 1, delta, 1);
            axpy_ongpu(x_size, 1, astate.delta, 1, delta, 1);

            //fill_ongpu(x_size, 0, delta, 1);
            //cuda_push_array(delta, X, x_size);
            //axpy_ongpu(x_size, -1, imlayer.output_gpu, 1, delta, 1);
            //printf("pix error: %f\n", cuda_mag_array(delta, x_size));
            printf("fea error: %f\n", cuda_mag_array(fstate.delta, x_size));
            printf("adv error: %f\n", cuda_mag_array(astate.delta, x_size));
            //axpy_ongpu(x_size, 1, astate.delta, 1, delta, 1);

            backward_network_gpu(gnet, gstate);

            floss += get_network_cost(fnet) / (fnet.subdivisions*fnet.batch);

            cuda_pull_array(imlayer.output_gpu, imlayer.output, x_size);
            for (k = 0; k < gnet.batch; ++k) {
                int index = j*gnet.batch + k;
                copy_cpu(imlayer.outputs, imlayer.output + k*imlayer.outputs, 1, generated.X.vals[index], 1);
                generated.y.vals[index][0] = .01;
            }
        }

        /*
        image sim = float_to_image(anet.w, anet.h, anet.c, style.X.vals[j]);
        show_image(sim, "style");
        cvWaitKey(0);
        */

        harmless_update_network_gpu(anet);

        data merge = concat_data(style, generated);
        randomize_data(merge);
        float aloss = train_network(anet, merge);

        update_network_gpu(gnet);

        free_data(merge);
        free_data(train);
        free_data(generated);
        free_data(style);

        if (aloss_avg < 0) aloss_avg = aloss;
        if (floss_avg < 0) floss_avg = floss;
        aloss_avg = aloss_avg*.9 + aloss*.1;
        floss_avg = floss_avg*.9 + floss*.1;

        printf("%d: gen: %f, adv: %f | gen_avg: %f, adv_avg: %f, %f rate, %lf seconds, %d images\n", i, floss, aloss, floss_avg, aloss_avg, get_current_rate(gnet), sec(clock()-time), i*imgs);
        if (i%1000 == 0) {
            char buff[256];
            sprintf(buff, "%s/%s_%d.weights", backup_directory, gbase, i);
            save_weights(gnet, buff);
            sprintf(buff, "%s/%s_%d.weights", backup_directory, abase, i);
            save_weights(anet, buff);
        }
        if (i%100 == 0) {
            char buff[256];
            sprintf(buff, "%s/%s.backup", backup_directory, gbase);
            save_weights(gnet, buff);
            sprintf(buff, "%s/%s.backup", backup_directory, abase);
            save_weights(anet, buff);
        }
    }
#endif
}
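/*
 * The generator update in train_lsd3 mixes two gradients at the image layer: the
 * feature-matching delta from fnet (scaled by .00001) and the adversarial delta from
 * anet (scaled by 100). A plain-C sketch of that blend; the weights simply mirror the
 * scal/axpy calls above and are tuning constants, not anything fundamental.
 */
static void blend_generator_delta_sketch(float *delta, const float *feat_delta,
                                         const float *adv_delta, int n)
{
    int i;
    for (i = 0; i < n; ++i) {
        delta[i] = .00001f * feat_delta[i] + 100.f * adv_delta[i];
    }
}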