/* GPU-path forward pass for the detection layer.
 * At inference the input is copied straight to the layer output on the
 * device. During training the input (and truth, when present) is pulled to
 * host buffers, the CPU forward pass computes output/delta, and both are
 * pushed back to the GPU. */
void forward_detection_layer_gpu(const detection_layer l, network_state state)
{
    if (!state.train) {
        copy_ongpu(l.batch*l.inputs, state.input, 1, l.output_gpu, 1);
        return;
    }

    float *host_input = calloc(l.batch*l.inputs, sizeof(float));
    float *host_truth = 0;
    if (state.truth) {
        int truth_count = l.batch*l.side*l.side*(1 + l.coords + l.classes);
        host_truth = calloc(truth_count, sizeof(float));
        cuda_pull_array(state.truth, host_truth, truth_count);
    }
    cuda_pull_array(state.input, host_input, l.batch*l.inputs);

    /* Copy the full state so only the host pointers are overridden. */
    network_state host_state = state;
    host_state.truth = host_truth;
    host_state.input = host_input;
    forward_detection_layer(l, host_state);

    cuda_push_array(l.output_gpu, l.output, l.batch*l.outputs);
    cuda_push_array(l.delta_gpu, l.delta, l.batch*l.inputs);

    free(host_input);
    if (host_truth) free(host_truth);
}
/* Copies a connected layer's weights, biases, and their accumulated
 * update buffers from device memory back to the host-side arrays. */
void pull_connected_layer(connected_layer l)
{
    int weight_count = l.inputs*l.outputs;
    cuda_pull_array(l.weights_gpu, l.weights, weight_count);
    cuda_pull_array(l.biases_gpu, l.biases, l.outputs);
    cuda_pull_array(l.weight_updates_gpu, l.weight_updates, weight_count);
    cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.outputs);
}
/* Pulls a locally-connected layer's weights and biases from the GPU.
 * Weight count is per spatial location: size*size*c*n filters at each of
 * the out_w*out_h locations. */
void pull_local_layer(local_layer l)
{
    int n_locations = l.out_w*l.out_h;
    int n_weights = l.size*l.size*l.c*l.n*n_locations;
    cuda_pull_array(l.weights_gpu, l.weights, n_weights);
    cuda_pull_array(l.biases_gpu, l.biases, l.outputs);
}
/* GPU-path backward pass for the detection layer.
 * Pulls input, incoming delta, truth (when present), and the layer's own
 * delta to the host, runs the CPU backward pass, and pushes the updated
 * upstream delta back to the device.
 * Fix: cpu_state is now initialized as a copy of the incoming state so
 * fields other than train/input/truth/delta (e.g. index, net) are not
 * passed into backward_detection_layer uninitialized — this matches how
 * forward_detection_layer_gpu builds its cpu_state. */
void backward_detection_layer_gpu(detection_layer l, network_state state)
{
    int outputs = get_detection_layer_output_size(l);

    float *in_cpu = calloc(l.batch*l.inputs, sizeof(float));
    float *delta_cpu = calloc(l.batch*l.inputs, sizeof(float));
    float *truth_cpu = 0;
    if (state.truth) {
        truth_cpu = calloc(l.batch*outputs, sizeof(float));
        cuda_pull_array(state.truth, truth_cpu, l.batch*outputs);
    }

    /* Copy the whole state first; only redirect the host buffers. */
    network_state cpu_state = state;
    cpu_state.train = state.train;
    cpu_state.input = in_cpu;
    cpu_state.truth = truth_cpu;
    cpu_state.delta = delta_cpu;

    cuda_pull_array(state.input, in_cpu, l.batch*l.inputs);
    cuda_pull_array(state.delta, delta_cpu, l.batch*l.inputs);
    cuda_pull_array(l.delta_gpu, l.delta, l.batch*outputs);
    backward_detection_layer(l, cpu_state);
    cuda_push_array(state.delta, delta_cpu, l.batch*l.inputs);

    if (truth_cpu) free(truth_cpu);
    free(in_cpu);
    free(delta_cpu);
}
/* Pulls trainable parameters from the GPU for convolutional and connected
 * layers; all other layer types are left untouched. Convolutional scales
 * are pulled only when allocated. */
void pull_weights(layer l)
{
    if (l.type == CONVOLUTIONAL) {
        cuda_pull_array(l.biases_gpu, l.biases, l.n);
        cuda_pull_array(l.weights_gpu, l.weights, l.n*l.size*l.size*l.c);
        if (l.scales) {
            cuda_pull_array(l.scales_gpu, l.scales, l.n);
        }
    } else if (l.type == CONNECTED) {
        cuda_pull_array(l.biases_gpu, l.biases, l.outputs);
        cuda_pull_array(l.weights_gpu, l.weights, l.outputs*l.inputs);
    }
}
/* Manual/visual smoke test: loads an image, derives grayscale, saturation,
 * and exposure variants, and (when built with GPU) exercises shortcut_gpu
 * by adding the image into a larger 9-channel canvas and back into a
 * smaller 3-channel one, then displays everything.
 * NOTE(review): the intermediate images (r, black, black2) and the device
 * buffers r_gpu/black_gpu/black2_gpu are never freed — tolerable for a
 * one-shot manual test, a leak if called repeatedly. */
void test_resize(char *filename)
{
    image im = load_image(filename, 0,0, 3);
    float mag = mag_array(im.data, im.w*im.h*im.c);
    printf("L2 Norm: %f\n", mag);
    image gray = grayscale_image(im);

    image sat2 = copy_image(im);
    saturate_image(sat2, 2);
    image sat5 = copy_image(im);
    saturate_image(sat5, .5);

    image exp2 = copy_image(im);
    exposure_image(exp2, 2);
    image exp5 = copy_image(im);
    exposure_image(exp5, .5);

#ifdef GPU
    image r = resize_image(im, im.w, im.h);
    /* Oversized 9-channel canvas and a same-size 3-channel target. */
    image black = make_image(im.w*2 + 3, im.h*2 + 3, 9);
    image black2 = make_image(im.w, im.h, 3);

    float *r_gpu = cuda_make_array(r.data, r.w*r.h*r.c);
    float *black_gpu = cuda_make_array(black.data, black.w*black.h*black.c);
    float *black2_gpu = cuda_make_array(black2.data, black2.w*black2.h*black2.c);

    shortcut_gpu(3, r.w, r.h, 1, r_gpu, black.w, black.h, 3, black_gpu);
    //flip_image(r);
    //shortcut_gpu(3, r.w, r.h, 1, r.data, black.w, black.h, 3, black.data);

    shortcut_gpu(3, black.w, black.h, 3, black_gpu, black2.w, black2.h, 1, black2_gpu);

    cuda_pull_array(black_gpu, black.data, black.w*black.h*black.c);
    cuda_pull_array(black2_gpu, black2.data, black2.w*black2.h*black2.c);
    show_image_layers(black, "Black");
    show_image(black2, "Recreate");
#endif

    show_image(im, "Original");
    show_image(gray, "Gray");
    show_image(sat2, "Saturation-2");
    show_image(sat5, "Saturation-.5");
    show_image(exp2, "Exposure-2");
    show_image(exp5, "Exposure-.5");
#ifdef OPENCV
    cvWaitKey(0);
#endif
}
/* Returns the magnitude (as computed by mag_array) of a device array of
 * length n by copying it into a temporary host buffer first. */
float cuda_mag_array(float *x_gpu, size_t n)
{
    float *host_copy = calloc(n, sizeof(float));
    cuda_pull_array(x_gpu, host_copy, n);
    float magnitude = mag_array(host_copy, n);
    free(host_copy);
    return magnitude;
}
/* CPU-fallback forward pass for the region layer on the GPU path: pulls
 * the input (and truth, when present) to host buffers, runs the CPU
 * implementation, then pushes the layer output and delta back to the
 * device.
 * Fix: cpu_state is now initialized as a copy of the incoming state so
 * fields beyond train/truth/input (e.g. index, net) are not passed into
 * forward_region_layer uninitialized — consistent with the other
 * *_layer_gpu wrappers in this file. */
void forward_region_layer_gpu(const region_layer l, network_state state)
{
    float *in_cpu = calloc(l.batch*l.inputs, sizeof(float));
    float *truth_cpu = 0;
    if (state.truth) {
        truth_cpu = calloc(l.batch*l.outputs, sizeof(float));
        cuda_pull_array(state.truth, truth_cpu, l.batch*l.outputs);
    }
    cuda_pull_array(state.input, in_cpu, l.batch*l.inputs);

    /* Copy the full state; only redirect host pointers. */
    network_state cpu_state = state;
    cpu_state.train = state.train;
    cpu_state.truth = truth_cpu;
    cpu_state.input = in_cpu;
    forward_region_layer(l, cpu_state);

    cuda_push_array(l.output_gpu, l.output, l.batch*l.outputs);
    cuda_push_array(l.delta_gpu, l.delta, l.batch*l.inputs);

    free(cpu_state.input);
    if (cpu_state.truth) free(cpu_state.truth);
}
/* Forward pass of the cost layer on the GPU (network-pointer API).
 * Computes per-element deltas (l.delta_gpu) and costs (l.output_gpu) for
 * the selected cost type, optionally applies label smoothing, SEG masking,
 * and delta suppression, then sums the per-element costs on the host into
 * l.cost[0]. */
void forward_cost_layer_gpu(cost_layer l, network *net)
{
    if (!net->truth) return;

    /* Label smoothing: truth <- (1-smooth)*truth + smooth/inputs. */
    if(l.smooth){
        scal_gpu(l.batch*l.inputs, (1-l.smooth), net->truth_gpu, 1);
        add_gpu(l.batch*l.inputs, l.smooth * 1./l.inputs, net->truth_gpu, 1);
    }

    /* Per-element delta and cost for the chosen distance; L2 is default. */
    if(l.cost_type == SMOOTH){
        smooth_l1_gpu(l.batch*l.inputs, net->input_gpu, net->truth_gpu, l.delta_gpu, l.output_gpu);
    } else if (l.cost_type == L1){
        l1_gpu(l.batch*l.inputs, net->input_gpu, net->truth_gpu, l.delta_gpu, l.output_gpu);
    } else if (l.cost_type == WGAN){
        wgan_gpu(l.batch*l.inputs, net->input_gpu, net->truth_gpu, l.delta_gpu, l.output_gpu);
    } else {
        l2_gpu(l.batch*l.inputs, net->input_gpu, net->truth_gpu, l.delta_gpu, l.output_gpu);
    }

    /* SEG: rescale delta/cost where truth equals the mask value 0 —
     * presumably background down-weighting; confirm scale_mask_gpu
     * semantics before relying on this. */
    if (l.cost_type == SEG && l.noobject_scale != 1) {
        scale_mask_gpu(l.batch*l.inputs, l.delta_gpu, 0, net->truth_gpu, l.noobject_scale);
        scale_mask_gpu(l.batch*l.inputs, l.output_gpu, 0, net->truth_gpu, l.noobject_scale);
    }

    if (l.cost_type == MASKED) {
        mask_gpu(l.batch*l.inputs, net->delta_gpu, SECRET_NUM, net->truth_gpu, 0);
    }

    /* NOTE(review): thresh is computed from the sorted deltas and then
     * immediately overwritten with 0, so supp_gpu effectively suppresses
     * nothing here — looks like debugging leftovers; confirm intent. */
    if(l.ratio){
        cuda_pull_array(l.delta_gpu, l.delta, l.batch*l.inputs);
        qsort(l.delta, l.batch*l.inputs, sizeof(float), float_abs_compare);
        int n = (1-l.ratio) * l.batch*l.inputs;
        float thresh = l.delta[n];
        thresh = 0;
        printf("%f\n", thresh);
        supp_gpu(l.batch*l.inputs, thresh, l.delta_gpu, 1);
    }

    /* Suppress deltas below a fixed, input-size-scaled threshold. */
    if(l.thresh){
        supp_gpu(l.batch*l.inputs, l.thresh*1./l.inputs, l.delta_gpu, 1);
    }

    /* Scalar cost accumulated on the host. */
    cuda_pull_array(l.output_gpu, l.output, l.batch*l.inputs);
    l.cost[0] = sum_array(l.output, l.batch*l.inputs);
}
/* Forward pass of the region layer on the GPU path (device-side softmax).
 * Flattens the input so each prediction's channels are contiguous, applies
 * softmax over class scores on the device (tree-structured or flat), then
 * pulls the activated output to the host, runs the CPU forward pass to
 * compute loss/deltas, and pushes deltas back during training. */
void forward_region_layer_gpu(const region_layer l, network_state state)
{
    /*
    if(!state.train){
        copy_ongpu(l.batch*l.inputs, state.input, 1, l.output_gpu, 1);
        return;
    }
    */
    flatten_ongpu(state.input, l.h*l.w, l.n*(l.coords + l.classes + 1), l.batch, 1, l.output_gpu);
    if(l.softmax_tree){
        /* Hierarchical softmax: one softmax per tree group. The start
         * offset of 5 skips the box coords + objectness — this assumes
         * l.coords == 4; TODO confirm. */
        int i;
        int count = 5;
        for (i = 0; i < l.softmax_tree->groups; ++i) {
            int group_size = l.softmax_tree->group_size[i];
            softmax_gpu(l.output_gpu+count, group_size, l.classes + 5, l.w*l.h*l.n*l.batch, 1, l.output_gpu + count);
            count += group_size;
        }
    }else if (l.softmax){
        /* Flat softmax over the class block of every prediction. */
        softmax_gpu(l.output_gpu+5, l.classes, l.classes + 5, l.w*l.h*l.n*l.batch, 1, l.output_gpu + 5);
    }

    float *in_cpu = calloc(l.batch*l.inputs, sizeof(float));
    float *truth_cpu = 0;
    if(state.truth){
        int num_truth = l.batch*l.truths;
        truth_cpu = calloc(num_truth, sizeof(float));
        cuda_pull_array(state.truth, truth_cpu, num_truth);
    }
    /* Pull the already-activated layer output, not the raw input. */
    cuda_pull_array(l.output_gpu, in_cpu, l.batch*l.inputs);
    //cudaStreamSynchronize(get_cuda_stream());

    network_state cpu_state = state;
    cpu_state.train = state.train;
    cpu_state.truth = truth_cpu;
    cpu_state.input = in_cpu;
    forward_region_layer(l, cpu_state);
    //cuda_push_array(l.output_gpu, l.output, l.batch*l.outputs);
    free(cpu_state.input);
    /* NOTE(review): if truth were non-null at inference time, this early
     * return would leak truth_cpu; in practice truth appears to be set
     * only during training — confirm against callers. */
    if(!state.train) return;
    cuda_push_array(l.delta_gpu, l.delta, l.batch*l.outputs);
    //cudaStreamSynchronize(get_cuda_stream());
    if(cpu_state.truth) free(cpu_state.truth);
}
/* Compares a device array against a host reference array of length n.
 * Prints the RMS difference labelled with s; returns the raw squared-error
 * sum (not the printed RMS value). */
float cuda_compare(float *x_gpu, float *x, size_t n, char *s)
{
    float *pulled = calloc(n, sizeof(float));
    cuda_pull_array(x_gpu, pulled, n);
    /* pulled <- pulled - x, then err = ||pulled||^2 */
    axpy_cpu(n, -1, x, 1, pulled, 1);
    float err = dot_cpu(n, pulled, 1, pulled, 1);
    printf("Error %s: %f\n", s, sqrt(err / n));
    free(pulled);
    return err;
}
/* Returns layer i's output wrapped as an image (pulling from the GPU
 * first when built with GPU support), or an all-zero image struct when
 * the layer has no spatial output dimensions. */
image get_network_image_layer(network net, int i)
{
    layer l = net.layers[i];
#ifdef GPU
    cuda_pull_array(l.output_gpu, l.output, l.outputs);
#endif
    if (!(l.out_w && l.out_h && l.out_c)) {
        image empty = {0};
        return empty;
    }
    return float_to_image(l.out_w, l.out_h, l.out_c, l.output);
}
/* L2 cost forward pass on the GPU: delta = truth - input (with optional
 * masking first), then the scalar cost is the squared norm of delta,
 * computed on the host after pulling it back. */
void forward_cost_layer_gpu(cost_layer l, network_state state)
{
    if (!state.truth) return;

    int count = l.batch*l.inputs;
    if (l.cost_type == MASKED) {
        mask_ongpu(count, state.input, state.truth);
    }

    copy_ongpu(count, state.truth, 1, l.delta_gpu, 1);
    axpy_ongpu(count, -1, state.input, 1, l.delta_gpu, 1);

    cuda_pull_array(l.delta_gpu, l.delta, count);
    *(l.output) = dot_cpu(count, l.delta, 1, l.delta, 1);
}
/* Forward pass of the instance-segmentation layer on the GPU path:
 * copies the input to the layer output on the device, applies logistic
 * activation to each batch item's class maps, then runs the CPU forward
 * pass on the pulled activations and pushes the resulting delta back. */
void forward_iseg_layer_gpu(const layer l, network net)
{
    copy_gpu(l.batch * l.inputs, net.input_gpu, 1, l.output_gpu, 1, net.st);

    int batch_idx;
    for (batch_idx = 0; batch_idx < l.batch; ++batch_idx) {
        float *batch_out = l.output_gpu + batch_idx * l.outputs;
        activate_array_gpu(batch_out, l.classes * l.w * l.h, LOGISTIC, net.st);
    }

    cuda_pull_array(l.output_gpu, net.input, l.batch * l.inputs);
    forward_iseg_layer(l, net);
    cuda_push_array(l.delta_gpu, l.delta, l.batch * l.outputs);
}
/* Pulls a connected layer's parameters and accumulated updates from the
 * GPU, plus batch-norm scale and rolling statistics when batch
 * normalization is enabled. */
void pull_connected_layer(connected_layer l)
{
    int weight_count = l.inputs*l.outputs;
    cuda_pull_array(l.weights_gpu, l.weights, weight_count);
    cuda_pull_array(l.biases_gpu, l.biases, l.outputs);
    cuda_pull_array(l.weight_updates_gpu, l.weight_updates, weight_count);
    cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.outputs);
    if (l.batch_normalize) {
        cuda_pull_array(l.scales_gpu, l.scales, l.outputs);
        cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, l.outputs);
        cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, l.outputs);
    }
}
/* Forward pass of the cost layer on the GPU (network_state API, ongpu
 * kernels). Computes per-element deltas/costs for SMOOTH, L1, or L2,
 * with optional label smoothing, masking, and delta suppression; the
 * scalar cost is summed on the host into l.cost[0]. */
void forward_cost_layer_gpu(cost_layer l, network_state state)
{
    if (!state.truth) return;

    /* Label smoothing: truth <- (1-smooth)*truth + smooth/inputs. */
    if(l.smooth){
        scal_ongpu(l.batch*l.inputs, (1-l.smooth), state.truth, 1);
        add_ongpu(l.batch*l.inputs, l.smooth * 1./l.inputs, state.truth, 1);
    }
    if (l.cost_type == MASKED) {
        mask_ongpu(l.batch*l.inputs, state.input, SECRET_NUM, state.truth);
    }

    /* Per-element delta and cost; L2 is the default. */
    if(l.cost_type == SMOOTH){
        smooth_l1_gpu(l.batch*l.inputs, state.input, state.truth, l.delta_gpu, l.output_gpu);
    } else if (l.cost_type == L1){
        l1_gpu(l.batch*l.inputs, state.input, state.truth, l.delta_gpu, l.output_gpu);
    } else {
        l2_gpu(l.batch*l.inputs, state.input, state.truth, l.delta_gpu, l.output_gpu);
    }

    /* NOTE(review): thresh is computed from the sorted deltas and then
     * immediately overwritten with 0, so supp_ongpu effectively suppresses
     * nothing here — looks like debugging leftovers; confirm intent. */
    if(l.ratio){
        cuda_pull_array(l.delta_gpu, l.delta, l.batch*l.inputs);
        qsort(l.delta, l.batch*l.inputs, sizeof(float), float_abs_compare);
        int n = (1-l.ratio) * l.batch*l.inputs;
        float thresh = l.delta[n];
        thresh = 0;
        printf("%f\n", thresh);
        supp_ongpu(l.batch*l.inputs, thresh, l.delta_gpu, 1);
    }

    /* Suppress deltas below a fixed, input-size-scaled threshold. */
    if(l.thresh){
        supp_ongpu(l.batch*l.inputs, l.thresh*1./l.inputs, l.delta_gpu, 1);
    }

    /* Scalar cost accumulated on the host. */
    cuda_pull_array(l.output_gpu, l.output, l.batch*l.inputs);
    l.cost[0] = sum_array(l.output, l.batch*l.inputs);
}
void vec_char_rnn(char *cfgfile, char *weightfile, char *seed) { char *base = basecfg(cfgfile); fprintf(stderr, "%s\n", base); network net = parse_network_cfg(cfgfile); if(weightfile){ load_weights(&net, weightfile); } int inputs = get_network_input_size(net); int c; int seed_len = strlen(seed); float *input = calloc(inputs, sizeof(float)); int i; char *line; while((line=fgetl(stdin)) != 0){ reset_rnn_state(net, 0); for(i = 0; i < seed_len; ++i){ c = seed[i]; input[(int)c] = 1; network_predict(net, input); input[(int)c] = 0; } strip(line); int str_len = strlen(line); for(i = 0; i < str_len; ++i){ c = line[i]; input[(int)c] = 1; network_predict(net, input); input[(int)c] = 0; } c = ' '; input[(int)c] = 1; network_predict(net, input); input[(int)c] = 0; layer l = net.layers[0]; #ifdef GPU cuda_pull_array(l.output_gpu, l.output, l.outputs); #endif printf("%s", line); for(i = 0; i < l.outputs; ++i){ printf(",%g", l.output[i]); } printf("\n"); } }
/* Cost layer forward pass on the GPU: optional masking, then either a
 * smooth-L1 delta computed on the device or a plain L2 delta
 * (truth - input). The scalar cost is the squared norm of delta,
 * computed on the host after pulling it back. */
void forward_cost_layer_gpu(cost_layer l, network_state state)
{
    if (!state.truth) return;

    int count = l.batch*l.inputs;
    if (l.cost_type == MASKED) {
        mask_ongpu(count, state.input, SECRET_NUM, state.truth);
    }

    if (l.cost_type == SMOOTH) {
        smooth_l1_gpu(count, state.input, state.truth, l.delta_gpu);
    } else {
        copy_ongpu(count, state.truth, 1, l.delta_gpu, 1);
        axpy_ongpu(count, -1, state.input, 1, l.delta_gpu, 1);
    }

    cuda_pull_array(l.delta_gpu, l.delta, count);
    *(l.output) = dot_cpu(count, l.delta, 1, l.delta, 1);
}
/* Host-convenience GEMM wrapper: uploads A, B, and C into freshly
 * allocated device buffers, runs gemm_ongpu, pulls the result back into
 * C, and frees the device memory. TA/TB select transposition of A/B;
 * lda/ldb/ldc are the leading dimensions. */
void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA,
        float *A, int lda,
        float *B, int ldb,
        float BETA,
        float *C, int ldc)
{
    /* Element counts depend on whether each operand is transposed. */
    float *d_A = cuda_make_array(A, (TA ? lda*K : lda*M));
    float *d_B = cuda_make_array(B, (TB ? ldb*N : ldb*K));
    float *d_C = cuda_make_array(C, ldc*M);

    gemm_ongpu(TA, TB, M, N, K, ALPHA, d_A, lda, d_B, ldb, BETA, d_C, ldc);

    cuda_pull_array(d_C, C, ldc*M);
    cuda_free(d_A);
    cuda_free(d_B);
    cuda_free(d_C);
}
/* Iteratively updates recon so the network's output matches the target
 * feature vector: each iteration runs forward+backward with features as
 * the truth, adds the input-gradient to a momentum buffer (update),
 * smooths, applies the step, and clamps the image to valid range. */
void reconstruct_picture(network net, float *features, image recon, image update, float rate, float momentum, float lambda, int smooth_size, int iters)
{
    int iter = 0;
    for (iter = 0; iter < iters; ++iter) {
        image delta = make_image(recon.w, recon.h, recon.c);

        NETWORK_STATE(state);
#ifdef GPU
        state.input = cuda_make_array(recon.data, recon.w*recon.h*recon.c);
        state.delta = cuda_make_array(delta.data, delta.w*delta.h*delta.c);
        state.truth = cuda_make_array(features, get_network_output_size(net));

        forward_network_gpu(net, state);
        backward_network_gpu(net, state);

        /* Gradient w.r.t. the input image, back to the host. */
        cuda_pull_array(state.delta, delta.data, delta.w*delta.h*delta.c);

        cuda_free(state.input);
        cuda_free(state.delta);
        cuda_free(state.truth);
#else
        state.input = recon.data;
        state.delta = delta.data;
        state.truth = features;

        forward_network(net, state);
        backward_network(net, state);
#endif

        /* Accumulate gradient into the momentum buffer, smooth, step. */
        fltadd(update.data, delta.data, recon.w * recon.h * recon.c);
        smooth(recon, update, lambda, smooth_size);

        fltaddmul(recon.data, update.data, recon.w * recon.h * recon.c, rate);
        scal_cpu(recon.w*recon.h*recon.c, momentum, update.data, 1);

        //float mag = mag_array(recon.data, recon.w*recon.h*recon.c);
        //scal_cpu(recon.w*recon.h*recon.c, 600/mag, recon.data, 1);

        constrain_image(recon);
        free_image(delta);
    }
}
/* Single-step variant of picture reconstruction: temporarily remaps recon
 * from [0,1] to [-1,1], runs forward+backward with features as the truth,
 * accumulates the input-gradient into the momentum buffer (update),
 * smooths and applies the step, then maps back to [0,1] and clamps. */
void reconstruct_picture(network net, float *features, image recon, image update, float rate, float momentum, float lambda, int smooth_size)
{
    /* Map pixel range [0,1] -> [-1,1] for the network. */
    scale_image(recon, 2);
    translate_image(recon, -1);

    image delta = make_image(recon.w, recon.h, recon.c);

    network_state state = {0};
#ifdef GPU
    state.input = cuda_make_array(recon.data, recon.w*recon.h*recon.c);
    state.delta = cuda_make_array(delta.data, delta.w*delta.h*delta.c);
    state.truth = cuda_make_array(features, get_network_output_size(net));

    forward_network_gpu(net, state);
    backward_network_gpu(net, state);

    /* Gradient w.r.t. the input image, back to the host. */
    cuda_pull_array(state.delta, delta.data, delta.w*delta.h*delta.c);

    cuda_free(state.input);
    cuda_free(state.delta);
    cuda_free(state.truth);
#else
    state.input = recon.data;
    state.delta = delta.data;
    state.truth = features;

    forward_network(net, state);
    backward_network(net, state);
#endif

    /* update += delta; smooth; recon += rate*update; update *= momentum. */
    axpy_cpu(recon.w*recon.h*recon.c, 1, delta.data, 1, update.data, 1);
    smooth(recon, update, lambda, smooth_size);

    axpy_cpu(recon.w*recon.h*recon.c, rate, update.data, 1, recon.data, 1);
    scal_cpu(recon.w*recon.h*recon.c, momentum, update.data, 1);

    /* Map back from [-1,1] to [0,1]. */
    translate_image(recon, 1);
    scale_image(recon, .5);
    constrain_image(recon);
    free_image(delta);
}
/* Copies the softmax layer's output for the whole batch from device
 * memory to the host-side output buffer. */
void pull_softmax_layer_output(const softmax_layer layer)
{
    int count = layer.inputs*layer.batch;
    cuda_pull_array(layer.output_gpu, layer.output, count);
}
/* One deep-dream style optimization step: takes a random jittered,
 * scaled (and possibly flipped) crop of orig, truncates the network at
 * max_layer, maximizes that layer's activations by backpropagating the
 * output as its own gradient (shaped by calculate_loss/thresh), undoes
 * the flip/scale/jitter on the resulting input-gradient, optionally
 * normalizes it, and adds it into orig at the given rate. */
void optimize_picture(network *net, image orig, int max_layer, float scale, float rate, float thresh, int norm)
{
    //scale_image(orig, 2);
    //translate_image(orig, -1);

    /* Truncate the network at the target layer. */
    net->n = max_layer + 1;

    /* Random jitter and horizontal flip for this step. */
    int dx = rand()%16 - 8;
    int dy = rand()%16 - 8;
    int flip = rand()%2;

    image crop = crop_image(orig, dx, dy, orig.w, orig.h);
    image im = resize_image(crop, (int)(orig.w * scale), (int)(orig.h * scale));
    if(flip) flip_image(im);
    resize_network(net, im.w, im.h);
    layer_t last = net->layers[net->n-1];
    //net->layers[net->n - 1].activation = LINEAR;

    image delta = make_image(im.w, im.h, im.c);

    NETWORK_STATE(state);

#ifdef GPU
    state.input = cuda_make_array(im.data, im.w*im.h*im.c);
    /* NOTE(review): state.delta is seeded from im.data rather than zeros;
     * presumably it is fully overwritten during backward — confirm. */
    state.delta = cuda_make_array(im.data, im.w*im.h*im.c);

    forward_network_gpu(*net, state);
    /* Use the layer's own output as its gradient (activation maximization). */
    copy_ongpu(last.outputs, last.output_gpu, 1, last.delta_gpu, 1);

    cuda_pull_array(last.delta_gpu, last.delta, last.outputs);
    calculate_loss(last.delta, last.delta, last.outputs, thresh);
    cuda_push_array(last.delta_gpu, last.delta, last.outputs);

    backward_network_gpu(*net, state);

    cuda_pull_array(state.delta, delta.data, im.w*im.h*im.c);

    cuda_free(state.input);
    cuda_free(state.delta);
#else
    state.input = im.data;
    state.delta = delta.data;
    forward_network(*net, state);
    fltcpy(last.delta, last.output, last.outputs);
    calculate_loss(last.output, last.delta, last.outputs, thresh);
    backward_network(*net, state);
#endif

    /* Undo the flip/scale/jitter so the gradient aligns with orig. */
    if(flip) flip_image(delta);
    //normalize_array(delta.data, delta.w*delta.h*delta.c);
    image resized = resize_image(delta, orig.w, orig.h);
    image out = crop_image(resized, -dx, -dy, orig.w, orig.h);

    /*
    image g = grayscale_image(out);
    free_image(out);
    out = g;
    */

    //rate = rate / abs_mean(out.data, out.w*out.h*out.c);

    if(norm) normalize_array(out.data, out.w*out.h*out.c);
    /* Gradient-ascent step on the original image. */
    fltaddmul(orig.data, out.data, orig.w * orig.h * orig.c, rate);

    /*
    normalize_array(orig.data, orig.w*orig.h*orig.c);
    scale_image(orig, sqrt(var));
    translate_image(orig, mean);
    */

    //translate_image(orig, 1);
    //scale_image(orig, .5);
    //normalize_image(orig);

    constrain_image(orig);

    free_image(crop);
    free_image(im);
    free_image(delta);
    free_image(resized);
    free_image(out);
}
/* Pulls the network's final output layer activations from the GPU into
 * that layer's host-side buffer for the whole batch. */
void pull_network_output(network net)
{
    layer out = get_network_output_layer(net);
    cuda_pull_array(out.output_gpu, out.output, out.outputs*out.batch);
}
/* Adversarial training loop over three networks: fnet (a fixed feature
 * extractor providing the content loss), gnet (the generator being
 * trained), and anet (the adversarial discriminator). The generator's
 * image-layer gradient combines a scaled feature loss from fnet and an
 * adversarial loss from anet; generated images are then mixed with real
 * "style" images to train the discriminator. GPU-only (no-op otherwise).
 * NOTE(review): paths are hard-coded to the original author's machine;
 * zeros/gy_size are set up but unused. */
void train_lsd3(char *fcfg, char *fweight, char *gcfg, char *gweight, char *acfg, char *aweight, int clear)
{
#ifdef GPU
    //char *train_images = "/home/pjreddie/data/coco/trainvalno5k.txt";
    char *train_images = "/home/pjreddie/data/imagenet/imagenet1k.train.list";
    //char *style_images = "/home/pjreddie/data/coco/trainvalno5k.txt";
    char *style_images = "/home/pjreddie/zelda.txt";
    char *backup_directory = "/home/pjreddie/backup/";
    srand(time(0));
    network fnet = load_network(fcfg, fweight, clear);
    network gnet = load_network(gcfg, gweight, clear);
    network anet = load_network(acfg, aweight, clear);
    char *gbase = basecfg(gcfg);
    char *abase = basecfg(acfg);
    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", gnet.learning_rate, gnet.momentum, gnet.decay);
    int imgs = gnet.batch*gnet.subdivisions;
    int i = *gnet.seen/imgs;
    data train, tbuffer;
    data style, sbuffer;

    list *slist = get_paths(style_images);
    char **spaths = (char **)list_to_array(slist);
    list *tlist = get_paths(train_images);
    char **tpaths = (char **)list_to_array(tlist);

    /* Loader for training (content) images. */
    load_args targs= get_base_args(gnet);
    targs.paths = tpaths;
    targs.n = imgs;
    targs.m = tlist->size;
    targs.d = &tbuffer;
    targs.type = CLASSIFICATION_DATA;
    targs.classes = 1;
    char *ls[1] = {"zelda"};
    targs.labels = ls;

    /* Loader for style (real/discriminator-positive) images. */
    load_args sargs = get_base_args(gnet);
    sargs.paths = spaths;
    sargs.n = imgs;
    sargs.m = slist->size;
    sargs.d = &sbuffer;
    sargs.type = CLASSIFICATION_DATA;
    sargs.classes = 1;
    sargs.labels = ls;

    pthread_t tload_thread = load_data_in_thread(targs);
    pthread_t sload_thread = load_data_in_thread(sargs);
    clock_t time;

    float aloss_avg = -1;
    float floss_avg = -1;

    /* Feature-extractor state: device buffers for input/truth/delta. */
    network_state fstate = {};
    fstate.index = 0;
    fstate.net = fnet;
    int x_size = get_network_input_size(fnet)*fnet.batch;
    int y_size = get_network_output_size(fnet)*fnet.batch;
    fstate.input = cuda_make_array(0, x_size);
    fstate.truth = cuda_make_array(0, y_size);
    fstate.delta = cuda_make_array(0, x_size);
    fstate.train = 1;
    float *X = (float*)calloc(x_size, sizeof(float));
    float *y = (float*)calloc(y_size, sizeof(float));

    /* Soft labels for the discriminator (.99 real / .01 fake). */
    float *ones = cuda_make_array(0, anet.batch);
    float *zeros = cuda_make_array(0, anet.batch);
    fill_ongpu(anet.batch, .99, ones, 1);
    fill_ongpu(anet.batch, .01, zeros, 1);

    /* Discriminator state; input is wired to generator output later. */
    network_state astate = {};
    astate.index = 0;
    astate.net = anet;
    int ax_size = get_network_input_size(anet)*anet.batch;
    int ay_size = get_network_output_size(anet)*anet.batch;
    astate.input = 0;
    astate.truth = ones;
    astate.delta = cuda_make_array(0, ax_size);
    astate.train = 1;

    /* Generator state. */
    network_state gstate = {};
    gstate.index = 0;
    gstate.net = gnet;
    int gx_size = get_network_input_size(gnet)*gnet.batch;
    int gy_size = get_network_output_size(gnet)*gnet.batch;
    gstate.input = cuda_make_array(0, gx_size);
    gstate.truth = 0;
    gstate.delta = 0;
    gstate.train = 1;

    while (get_current_batch(gnet) < gnet.max_batches) {
        i += 1;
        time=clock();
        pthread_join(tload_thread, 0);
        pthread_join(sload_thread, 0);
        train = tbuffer;
        style = sbuffer;
        tload_thread = load_data_in_thread(targs);
        sload_thread = load_data_in_thread(sargs);
        printf("Loaded: %lf seconds\n", sec(clock()-time));

        data generated = copy_data(train);
        time=clock();

        int j, k;
        float floss = 0;
        for(j = 0; j < fnet.subdivisions; ++j){
            layer imlayer = gnet.layers[gnet.n - 1];
            get_next_batch(train, fnet.batch, j*fnet.batch, X, y);

            cuda_push_array(fstate.input, X, x_size);
            cuda_push_array(gstate.input, X, gx_size);
            *gnet.seen += gnet.batch;

            /* Target features: run the real image through fnet, take the
             * second-to-last layer's output as the truth. */
            forward_network_gpu(fnet, fstate);
            float *feats = fnet.layers[fnet.n - 2].output_gpu;
            copy_ongpu(y_size, feats, 1, fstate.truth, 1);

            /* Generate an image and evaluate its features against truth. */
            forward_network_gpu(gnet, gstate);
            float *gen = gnet.layers[gnet.n-1].output_gpu;
            copy_ongpu(x_size, gen, 1, fstate.input, 1);

            fill_ongpu(x_size, 0, fstate.delta, 1);
            forward_network_gpu(fnet, fstate);
            backward_network_gpu(fnet, fstate);
            //HERE

            /* Adversarial gradient: discriminator run on the generated image. */
            astate.input = gen;
            fill_ongpu(ax_size, 0, astate.delta, 1);
            forward_network_gpu(anet, astate);
            backward_network_gpu(anet, astate);

            /* Generator's image gradient = 1e-5*feature + 100*adversarial. */
            float *delta = imlayer.delta_gpu;
            fill_ongpu(x_size, 0, delta, 1);
            scal_ongpu(x_size, 100, astate.delta, 1);
            scal_ongpu(x_size, .00001, fstate.delta, 1);
            axpy_ongpu(x_size, 1, fstate.delta, 1, delta, 1);
            axpy_ongpu(x_size, 1, astate.delta, 1, delta, 1);

            //fill_ongpu(x_size, 0, delta, 1);
            //cuda_push_array(delta, X, x_size);
            //axpy_ongpu(x_size, -1, imlayer.output_gpu, 1, delta, 1);
            //printf("pix error: %f\n", cuda_mag_array(delta, x_size));
            printf("fea error: %f\n", cuda_mag_array(fstate.delta, x_size));
            printf("adv error: %f\n", cuda_mag_array(astate.delta, x_size));
            //axpy_ongpu(x_size, 1, astate.delta, 1, delta, 1);

            backward_network_gpu(gnet, gstate);

            floss += get_network_cost(fnet) /(fnet.subdivisions*fnet.batch);

            /* Collect generated images (labelled .01 = fake) for anet. */
            cuda_pull_array(imlayer.output_gpu, imlayer.output, x_size);
            for(k = 0; k < gnet.batch; ++k){
                int index = j*gnet.batch + k;
                copy_cpu(imlayer.outputs, imlayer.output + k*imlayer.outputs, 1, generated.X.vals[index], 1);
                generated.y.vals[index][0] = .01;
            }
        }

        /*
           image sim = float_to_image(anet.w, anet.h, anet.c, style.X.vals[j]);
           show_image(sim, "style");
           cvWaitKey(0);
         */

        /* Discard gradients anet accumulated during the generator pass. */
        harmless_update_network_gpu(anet);

        /* Train the discriminator on real style + generated images. */
        data merge = concat_data(style, generated);
        randomize_data(merge);
        float aloss = train_network(anet, merge);

        update_network_gpu(gnet);

        free_data(merge);
        free_data(train);
        free_data(generated);
        free_data(style);
        if (aloss_avg < 0) aloss_avg = aloss;
        if (floss_avg < 0) floss_avg = floss;
        aloss_avg = aloss_avg*.9 + aloss*.1;
        floss_avg = floss_avg*.9 + floss*.1;

        printf("%d: gen: %f, adv: %f | gen_avg: %f, adv_avg: %f, %f rate, %lf seconds, %d images\n", i, floss, aloss, floss_avg, aloss_avg, get_current_rate(gnet), sec(clock()-time), i*imgs);
        if(i%1000==0){
            char buff[256];
            sprintf(buff, "%s/%s_%d.weights", backup_directory, gbase, i);
            save_weights(gnet, buff);
            sprintf(buff, "%s/%s_%d.weights", backup_directory, abase, i);
            save_weights(anet, buff);
        }
        if(i%100==0){
            char buff[256];
            sprintf(buff, "%s/%s.backup", backup_directory, gbase);
            save_weights(gnet, buff);
            sprintf(buff, "%s/%s.backup", backup_directory, abase);
            save_weights(anet, buff);
        }
    }
#endif
}
/* Adversarial training loop over two networks: net (the generator, whose
 * first 3-channel output layer is treated as the generated image) and
 * anet (the discriminator). Each batch: run the generator, backprop the
 * discriminator's "realness" gradient into the generator's image layer,
 * update the generator, then train the discriminator on real + generated
 * images. GPU-only (no-op otherwise).
 * NOTE(review): paths are hard-coded to the original author's machine;
 * imerror is pulled/printed with imlayer.outputs but imlayer.output is
 * pulled with x_size — these agree only if the image layer is the input
 * size; confirm. */
void train_lsd2(char *cfgfile, char *weightfile, char *acfgfile, char *aweightfile, int clear)
{
#ifdef GPU
    char *train_images = "/home/pjreddie/data/coco/trainvalno5k.txt";
    char *backup_directory = "/home/pjreddie/backup/";
    srand(time(0));
    char *base = basecfg(cfgfile);
    printf("%s\n", base);
    network net = parse_network_cfg(cfgfile);
    if(weightfile){
        load_weights(&net, weightfile);
    }
    if(clear) *net.seen = 0;

    char *abase = basecfg(acfgfile);
    network anet = parse_network_cfg(acfgfile);
    if(aweightfile){
        load_weights(&anet, aweightfile);
    }
    if(clear) *anet.seen = 0;

    int i, j, k;
    layer imlayer = {};

    /* The first layer producing a 3-channel map is the generated image. */
    for (i = 0; i < net.n; ++i) {
        if (net.layers[i].out_c == 3) {
            imlayer = net.layers[i];
            break;
        }
    }

    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
    int imgs = net.batch*net.subdivisions;
    i = *net.seen/imgs;
    data train, buffer;

    list *plist = get_paths(train_images);
    //int N = plist->size;
    char **paths = (char **)list_to_array(plist);

    load_args args = {};
    args.w = net.w;
    args.h = net.h;
    args.paths = paths;
    args.n = imgs;
    args.m = plist->size;
    args.d = &buffer;

    args.min = net.min_crop;
    args.max = net.max_crop;
    args.angle = net.angle;
    args.aspect = net.aspect;
    args.exposure = net.exposure;
    args.saturation = net.saturation;
    args.hue = net.hue;
    args.size = net.w;
    args.type = CLASSIFICATION_DATA;
    args.classes = 1;
    char *ls[1] = {"coco"};
    args.labels = ls;

    pthread_t load_thread = load_data_in_thread(args);
    clock_t time;

    /* Generator state: only the input lives on the device here. */
    network_state gstate = {};
    gstate.index = 0;
    gstate.net = net;
    int x_size = get_network_input_size(net)*net.batch;
    int y_size = 1*net.batch;
    gstate.input = cuda_make_array(0, x_size);
    gstate.truth = 0;
    gstate.delta = 0;
    gstate.train = 1;
    float *X = (float*)calloc(x_size, sizeof(float));
    float *y = (float*)calloc(y_size, sizeof(float));

    /* Discriminator state; input/delta/truth are wired per batch. */
    network_state astate = {};
    astate.index = 0;
    astate.net = anet;
    int ay_size = get_network_output_size(anet)*anet.batch;
    astate.input = 0;
    astate.truth = 0;
    astate.delta = 0;
    astate.train = 1;

    /* imerror receives the discriminator's gradient w.r.t. the image;
     * ones_gpu is the "real" label target for fooling the discriminator. */
    float *imerror = cuda_make_array(0, imlayer.outputs);
    float *ones_gpu = cuda_make_array(0, ay_size);
    fill_ongpu(ay_size, 1, ones_gpu, 1);

    float aloss_avg = -1;
    float gloss_avg = -1;

    //data generated = copy_data(train);

    while (get_current_batch(net) < net.max_batches) {
        i += 1;
        time=clock();
        pthread_join(load_thread, 0);
        train = buffer;
        load_thread = load_data_in_thread(args);

        printf("Loaded: %lf seconds\n", sec(clock()-time));

        data generated = copy_data(train);
        time=clock();

        float gloss = 0;

        for(j = 0; j < net.subdivisions; ++j){
            get_next_batch(train, net.batch, j*net.batch, X, y);
            cuda_push_array(gstate.input, X, x_size);
            *net.seen += net.batch;
            forward_network_gpu(net, gstate);

            /* Discriminator pass on the generated image, with "real" as
             * the truth, so its input-gradient pushes toward realness. */
            fill_ongpu(imlayer.outputs, 0, imerror, 1);
            astate.input = imlayer.output_gpu;
            astate.delta = imerror;
            astate.truth = ones_gpu;
            forward_network_gpu(anet, astate);
            backward_network_gpu(anet, astate);

            /* Add the realness gradient into the generator's image layer. */
            scal_ongpu(imlayer.outputs, 1, imerror, 1);
            axpy_ongpu(imlayer.outputs, 1, imerror, 1, imlayer.delta_gpu, 1);

            backward_network_gpu(net, gstate);

            printf("features %f\n", cuda_mag_array(imlayer.delta_gpu, imlayer.outputs));
            printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs));

            gloss += get_network_cost(net) /(net.subdivisions*net.batch);

            /* Collect generated images (labelled 0 = fake) for anet. */
            cuda_pull_array(imlayer.output_gpu, imlayer.output, x_size);
            for(k = 0; k < net.batch; ++k){
                int index = j*net.batch + k;
                copy_cpu(imlayer.outputs, imlayer.output + k*imlayer.outputs, 1, generated.X.vals[index], 1);
                generated.y.vals[index][0] = 0;
            }
        }
        /* Discard gradients anet accumulated during the generator pass. */
        harmless_update_network_gpu(anet);

        /* Train the discriminator on real + generated images. */
        data merge = concat_data(train, generated);
        randomize_data(merge);
        float aloss = train_network(anet, merge);

        update_network_gpu(net);
        update_network_gpu(anet);
        free_data(merge);
        free_data(train);
        free_data(generated);
        if (aloss_avg < 0) aloss_avg = aloss;
        aloss_avg = aloss_avg*.9 + aloss*.1;
        gloss_avg = gloss_avg*.9 + gloss*.1;

        printf("%d: gen: %f, adv: %f | gen_avg: %f, adv_avg: %f, %f rate, %lf seconds, %d images\n", i, gloss, aloss, gloss_avg, aloss_avg, get_current_rate(net), sec(clock()-time), i*imgs);
        if(i%1000==0){
            char buff[256];
            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
            save_weights(net, buff);
            sprintf(buff, "%s/%s_%d.weights", backup_directory, abase, i);
            save_weights(anet, buff);
        }
        if(i%100==0){
            char buff[256];
            sprintf(buff, "%s/%s.backup", backup_directory, base);
            save_weights(net, buff);
            sprintf(buff, "%s/%s.backup", backup_directory, abase);
            save_weights(anet, buff);
        }
    }
    char buff[256];
    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
    save_weights(net, buff);
#endif
}
/* Pulls batch-norm scales and rolling statistics (one value per channel,
 * l.c of each) from the GPU into the host-side buffers. */
void pull_batchnorm_layer(layer l)
{
    cuda_pull_array(l.scales_gpu, l.scales, l.c);
    cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, l.c);
    cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, l.c);
}
/* Returns a host pointer to layer i's output buffer, pulling it from the
 * GPU first for all layer types except REGION — presumably the region
 * layer's forward pass already keeps its output on the host; confirm. */
float * ofxDarknet::get_network_output_layer_gpu(int i)
{
    layer l = net.layers[i];
    if (l.type != REGION) {
        cuda_pull_array(l.output_gpu, l.output, l.outputs*l.batch);
    }
    return l.output;
}
/* Copies the cost layer's delta (gradient w.r.t. its input) for the whole
 * batch from device memory to the host-side buffer. */
void pull_cost_layer(cost_layer l)
{
    int count = l.batch*l.inputs;
    cuda_pull_array(l.delta_gpu, l.delta, count);
}
void try_classifier(char *datacfg, char *cfgfile, char *weightfile, char *filename, int layer_num) { network *net = load_network(cfgfile, weightfile, 0); set_batch_network(net, 1); srand(2222222); list *options = read_data_cfg(datacfg); char *name_list = option_find_str(options, "names", 0); if(!name_list) name_list = option_find_str(options, "labels", "data/labels.list"); int top = option_find_int(options, "top", 1); int i = 0; char **names = get_labels(name_list); clock_t time; int *indexes = calloc(top, sizeof(int)); char buff[256]; char *input = buff; while(1){ if(filename){ strncpy(input, filename, 256); }else{ printf("Enter Image Path: "); fflush(stdout); input = fgets(input, 256, stdin); if(!input) return; strtok(input, "\n"); } image orig = load_image_color(input, 0, 0); image r = resize_min(orig, 256); image im = crop_image(r, (r.w - 224 - 1)/2 + 1, (r.h - 224 - 1)/2 + 1, 224, 224); float mean[] = {0.48263312050943, 0.45230225481413, 0.40099074308742}; float std[] = {0.22590347483426, 0.22120921437787, 0.22103996251583}; float var[3]; var[0] = std[0]*std[0]; var[1] = std[1]*std[1]; var[2] = std[2]*std[2]; normalize_cpu(im.data, mean, var, 1, 3, im.w*im.h); float *X = im.data; time=clock(); float *predictions = network_predict(net, X); layer l = net->layers[layer_num]; for(i = 0; i < l.c; ++i){ if(l.rolling_mean) printf("%f %f %f\n", l.rolling_mean[i], l.rolling_variance[i], l.scales[i]); } #ifdef GPU cuda_pull_array(l.output_gpu, l.output, l.outputs); #endif for(i = 0; i < l.outputs; ++i){ printf("%f\n", l.output[i]); } /* printf("\n\nWeights\n"); for(i = 0; i < l.n*l.size*l.size*l.c; ++i){ printf("%f\n", l.filters[i]); } printf("\n\nBiases\n"); for(i = 0; i < l.n; ++i){ printf("%f\n", l.biases[i]); } */ top_predictions(net, top, indexes); printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time)); for(i = 0; i < top; ++i){ int index = indexes[i]; printf("%s: %f\n", names[index], predictions[index]); } free_image(im); if (filename) break; } }