/*
 * Backward pass for the ISEG (instance segmentation) layer on the GPU.
 * Accumulates this layer's delta into the upstream network delta buffer
 * on stream net.st.
 *
 * The original per-batch loop contained only a commented-out call:
 *   //if(l.extra) gradient_array_gpu(l.output_gpu + b*l.outputs + l.classes*l.w*l.h,
 *   //                               l.extra*l.w*l.h, LOGISTIC,
 *   //                               l.delta_gpu + b*l.outputs + l.classes*l.w*l.h);
 * so the loop (and its index) was dead code and has been removed.
 */
void backward_iseg_layer_gpu(const layer l, network net)
{
    /* delta accumulation: net.delta += 1 * l.delta */
    axpy_gpu(l.batch * l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1, net.st);
}
/*
 * Backward pass for a shortcut (residual) layer on the GPU.
 * First back-propagates through the output activation, then splits the
 * gradient two ways: scaled by alpha into the sequential predecessor
 * (net.delta) and scaled by beta into the skipped-to layer's delta.
 * All kernels are launched on stream net.st.
 */
void backward_shortcut_layer_gpu(const layer l, network net)
{
    const int count = l.outputs * l.batch;
    float *skip_delta = net.layers[l.index].delta_gpu;

    /* chain rule through the activation applied at the output */
    gradient_array_gpu(l.output_gpu, count, l.activation, l.delta_gpu, net.st);
    /* straight-through path: net.delta += alpha * l.delta */
    axpy_gpu(count, l.alpha, l.delta_gpu, 1, net.delta_gpu, 1, net.st);
    /* skip path: route beta-scaled delta back to the layer at l.index,
       resampling between (out_w,out_h,out_c) and (w,h,c) as needed */
    shortcut_gpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta_gpu,
                 l.w, l.h, l.c, 1, l.beta, skip_delta, net.st);
}
/*
 * Backward pass (BPTT) for a convolutional RNN layer on the GPU.
 * Walks the time steps in reverse, back-propagating through the
 * output, self (recurrent) and input convolutions at each step.
 * NOTE(review): the sub-layer views are advanced/rewound by pointer
 * arithmetic via increment_layer; statement order here is load-bearing.
 */
void backward_crnn_layer_gpu(layer l, network net)
{
    network s = net;
    s.train = net.train;
    int i;
    /* local copies of the three internal conv layers; increment_layer
       shifts their output/delta pointers to a given time step */
    layer input_layer = *(l.input_layer);
    layer self_layer = *(l.self_layer);
    layer output_layer = *(l.output_layer);
    /* start at the last time step */
    increment_layer(&input_layer, l.steps - 1);
    increment_layer(&self_layer, l.steps - 1);
    increment_layer(&output_layer, l.steps - 1);
    /* state_gpu holds l.steps+? stacked hidden states; point past the end
       so the loop below can rebuild/rewind per step */
    l.state_gpu += l.hidden*l.batch*l.steps;
    for (i = l.steps-1; i >= 0; --i) {
        /* reconstruct hidden state at step i: input conv + self conv outputs */
        copy_gpu(l.hidden * l.batch, input_layer.output_gpu, 1, l.state_gpu, 1);
        axpy_gpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
        /* backprop output conv; its input was the hidden state,
           gradient w.r.t. state accumulates into self_layer.delta */
        s.input_gpu = l.state_gpu;
        s.delta_gpu = self_layer.delta_gpu;
        backward_convolutional_layer_gpu(output_layer, s);
        /* step back to the previous hidden state */
        l.state_gpu -= l.hidden*l.batch;
        /* backprop self conv; delta flows to the previous step's slot
           (none at step 0) */
        s.input_gpu = l.state_gpu;
        s.delta_gpu = self_layer.delta_gpu - l.hidden*l.batch;
        if (i == 0) s.delta_gpu = 0;
        backward_convolutional_layer_gpu(self_layer, s);
        /* input conv shares the same output gradient as self conv */
        copy_gpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);
        /* residual connection between consecutive steps, if enabled */
        if (i > 0 && l.shortcut) axpy_gpu(l.hidden*l.batch, 1, self_layer.delta_gpu, 1, self_layer.delta_gpu - l.hidden*l.batch, 1);
        /* backprop input conv against the step-i slice of the net input */
        s.input_gpu = net.input_gpu + i*l.inputs*l.batch;
        if(net.delta_gpu) s.delta_gpu = net.delta_gpu + i*l.inputs*l.batch;
        else s.delta_gpu = 0;
        backward_convolutional_layer_gpu(input_layer, s);
        /* rewind all three sub-layer views one step */
        increment_layer(&input_layer, -1);
        increment_layer(&self_layer, -1);
        increment_layer(&output_layer, -1);
    }
}
/*
 * Forward pass for a convolutional RNN layer on the GPU.
 * For each time step: run the input conv on the step's input, the self
 * conv on the current hidden state, sum both into the (advanced) state,
 * then run the output conv on that state.
 */
void forward_crnn_layer_gpu(layer l, network net)
{
    network s = net;
    int i;
    layer input_layer = *(l.input_layer);
    layer self_layer = *(l.self_layer);
    layer output_layer = *(l.output_layer);
    /* zero the sub-layer deltas for all steps up front so backward can
       accumulate into them */
    fill_gpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1);
    fill_gpu(l.hidden * l.batch * l.steps, 0, self_layer.delta_gpu, 1);
    fill_gpu(l.hidden * l.batch * l.steps, 0, input_layer.delta_gpu, 1);
    /* fresh hidden state at the start of a training sequence */
    if(net.train) fill_gpu(l.hidden * l.batch, 0, l.state_gpu, 1);
    for (i = 0; i < l.steps; ++i) {
        s.input_gpu = net.input_gpu;
        forward_convolutional_layer_gpu(input_layer, s);
        s.input_gpu = l.state_gpu;
        forward_convolutional_layer_gpu(self_layer, s);
        /* during training, each step writes a new state slot so BPTT can
           revisit earlier states; old_state is the previous slot */
        float *old_state = l.state_gpu;
        if(net.train) l.state_gpu += l.hidden*l.batch;
        if(l.shortcut){
            /* residual: carry the previous state forward */
            copy_gpu(l.hidden * l.batch, old_state, 1, l.state_gpu, 1);
        }else{
            fill_gpu(l.hidden * l.batch, 0, l.state_gpu, 1);
        }
        /* state += input conv output + self conv output */
        axpy_gpu(l.hidden * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1);
        axpy_gpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
        s.input_gpu = l.state_gpu;
        forward_convolutional_layer_gpu(output_layer, s);
        /* advance to the next step's input slice and sub-layer slots */
        net.input_gpu += l.inputs*l.batch;
        increment_layer(&input_layer, 1);
        increment_layer(&self_layer, 1);
        increment_layer(&output_layer, 1);
    }
}
int main(int argc, char** argv) { size_t pow = read_arg(argc, argv, 1, 16); size_t n = 1 << pow; std::cout << "memcopy and daxpy test of size " << n << "\n"; double* x = malloc_host<double>(n, 1.5); double* y = malloc_host<double>(n, 3.0); // use dummy fields to avoid cache effects, which make results harder to // interpret use 1<<24 to ensure that cache is completely purged for all n double* x_ = malloc_host<double>(n, 1.5); double* y_ = malloc_host<double>(n, 3.0); // openmp version: auto start = get_time(); axpy(n, 2.0, x_, y_); auto time_axpy_omp = get_time() - start; // openacc version: start = get_time(); axpy_gpu(n, 2.0, x, y); auto time_axpy_gpu = get_time() - start; std::cout << "-------\ntimings\n-------\n"; std::cout << "axpy (openmp): " << time_axpy_omp << " s\n"; std::cout << "axpy (openacc): " << time_axpy_gpu << " s\n"; // check for errors auto errors = 0; #pragma omp parallel for reduction(+:errors) for (auto i = 0; i < n; ++i) { if (std::fabs(6.-y[i]) > 1e-15) { ++errors; } } if (errors > 0) { std::cout << "\n============ FAILED with " << errors << " errors\n"; } else { std::cout << "\n============ PASSED\n"; } free(x); free(y); return 0; }
/*
 * Backward pass (BPTT) for an LSTM layer on the GPU.
 * wf/wi/wg/wo are the recurrent (hidden->gate) connected layers and
 * uf/ui/ug/uo the input->gate connected layers for the forget, input,
 * candidate (g) and output gates. The loop walks time steps in reverse,
 * recomputing each step's gate activations from the stored sub-layer
 * outputs and distributing gradients into each sub-layer's delta.
 * Statement order and the temp/temp2/temp3 buffer reuse are load-bearing.
 */
void backward_lstm_layer_gpu(layer l, network state)
{
    network s = { 0 };
    s.train = state.train;
    int i;
    /* local views of the eight internal connected layers */
    layer wf = *(l.wf);
    layer wi = *(l.wi);
    layer wg = *(l.wg);
    layer wo = *(l.wo);
    layer uf = *(l.uf);
    layer ui = *(l.ui);
    layer ug = *(l.ug);
    layer uo = *(l.uo);
    /* position every sub-layer view at the last time step */
    increment_layer(&wf, l.steps - 1);
    increment_layer(&wi, l.steps - 1);
    increment_layer(&wg, l.steps - 1);
    increment_layer(&wo, l.steps - 1);
    increment_layer(&uf, l.steps - 1);
    increment_layer(&ui, l.steps - 1);
    increment_layer(&ug, l.steps - 1);
    increment_layer(&uo, l.steps - 1);
    /* likewise point the sequence buffers at the last step */
    state.input_gpu += l.inputs * l.batch * (l.steps - 1);
    if (state.delta_gpu) state.delta_gpu += l.inputs * l.batch * (l.steps - 1);
    l.output_gpu += l.outputs * l.batch * (l.steps - 1);
    l.cell_gpu += l.outputs * l.batch * (l.steps - 1);
    l.delta_gpu += l.outputs * l.batch * (l.steps - 1);
    for (i = l.steps - 1; i >= 0; --i) {
        /* previous step's cell/hidden state (step 0 has none) */
        if (i != 0) copy_gpu(l.outputs * l.batch, l.cell_gpu - l.outputs * l.batch, 1, l.prev_cell_gpu, 1, state.st);
        copy_gpu(l.outputs * l.batch, l.cell_gpu, 1, l.c_gpu, 1, state.st);
        if (i != 0) copy_gpu(l.outputs * l.batch, l.output_gpu - l.outputs * l.batch, 1, l.prev_state_gpu, 1, state.st);
        copy_gpu(l.outputs * l.batch, l.output_gpu, 1, l.h_gpu, 1, state.st);
        /* gradient sink for the previous hidden state (null at step 0) */
        l.dh_gpu = (i == 0) ? 0 : l.delta_gpu - l.outputs * l.batch;
        /* rebuild pre-activation gate values: w* (recurrent) + u* (input) */
        copy_gpu(l.outputs * l.batch, wf.output_gpu, 1, l.f_gpu, 1, state.st);
        axpy_gpu(l.outputs * l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1, state.st);
        copy_gpu(l.outputs * l.batch, wi.output_gpu, 1, l.i_gpu, 1, state.st);
        axpy_gpu(l.outputs * l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1, state.st);
        copy_gpu(l.outputs * l.batch, wg.output_gpu, 1, l.g_gpu, 1, state.st);
        axpy_gpu(l.outputs * l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1, state.st);
        copy_gpu(l.outputs * l.batch, wo.output_gpu, 1, l.o_gpu, 1, state.st);
        axpy_gpu(l.outputs * l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1, state.st);
        /* gate activations: sigmoid for f/i/o, tanh for the candidate g */
        activate_array_gpu(l.f_gpu, l.outputs * l.batch, LOGISTIC, state.st);
        activate_array_gpu(l.i_gpu, l.outputs * l.batch, LOGISTIC, state.st);
        activate_array_gpu(l.g_gpu, l.outputs * l.batch, TANH, state.st);
        activate_array_gpu(l.o_gpu, l.outputs * l.batch, LOGISTIC, state.st);
        /* temp3 = dL/dh at this step */
        copy_gpu(l.outputs * l.batch, l.delta_gpu, 1, l.temp3_gpu, 1, state.st);
        /* temp2 = dL/dc = dL/dh * o * (1 - tanh(c)^2) + carried dc */
        copy_gpu(l.outputs * l.batch, l.c_gpu, 1, l.temp_gpu, 1, state.st);
        activate_array_gpu(l.temp_gpu, l.outputs * l.batch, TANH, state.st);
        copy_gpu(l.outputs * l.batch, l.temp3_gpu, 1, l.temp2_gpu, 1, state.st);
        mul_gpu(l.outputs * l.batch, l.o_gpu, 1, l.temp2_gpu, 1, state.st);
        gradient_array_gpu(l.temp_gpu, l.outputs * l.batch, TANH, l.temp2_gpu, state.st);
        axpy_gpu(l.outputs * l.batch, 1, l.dc_gpu, 1, l.temp2_gpu, 1, state.st);
        /* output gate gradient: temp = dL/dh * tanh(c) * o' */
        copy_gpu(l.outputs * l.batch, l.c_gpu, 1, l.temp_gpu, 1, state.st);
        activate_array_gpu(l.temp_gpu, l.outputs * l.batch, TANH, state.st);
        mul_gpu(l.outputs * l.batch, l.temp3_gpu, 1, l.temp_gpu, 1, state.st);
        gradient_array_gpu(l.o_gpu, l.outputs * l.batch, LOGISTIC, l.temp_gpu, state.st);
        copy_gpu(l.outputs * l.batch, l.temp_gpu, 1, wo.delta_gpu, 1, state.st);
        s.input_gpu = l.prev_state_gpu;
        s.delta_gpu = l.dh_gpu;
        backward_connected_layer_gpu(wo, s);
        copy_gpu(l.outputs * l.batch, l.temp_gpu, 1, uo.delta_gpu, 1, state.st);
        s.input_gpu = state.input_gpu;
        s.delta_gpu = state.delta_gpu;
        backward_connected_layer_gpu(uo, s);
        /* candidate gradient: temp = dc * i * tanh'(g) */
        copy_gpu(l.outputs * l.batch, l.temp2_gpu, 1, l.temp_gpu, 1, state.st);
        mul_gpu(l.outputs * l.batch, l.i_gpu, 1, l.temp_gpu, 1, state.st);
        gradient_array_gpu(l.g_gpu, l.outputs * l.batch, TANH, l.temp_gpu, state.st);
        copy_gpu(l.outputs * l.batch, l.temp_gpu, 1, wg.delta_gpu, 1, state.st);
        s.input_gpu = l.prev_state_gpu;
        s.delta_gpu = l.dh_gpu;
        backward_connected_layer_gpu(wg, s);
        copy_gpu(l.outputs * l.batch, l.temp_gpu, 1, ug.delta_gpu, 1, state.st);
        s.input_gpu = state.input_gpu;
        s.delta_gpu = state.delta_gpu;
        backward_connected_layer_gpu(ug, s);
        /* input gate gradient: temp = dc * g * i' */
        copy_gpu(l.outputs * l.batch, l.temp2_gpu, 1, l.temp_gpu, 1, state.st);
        mul_gpu(l.outputs * l.batch, l.g_gpu, 1, l.temp_gpu, 1, state.st);
        gradient_array_gpu(l.i_gpu, l.outputs * l.batch, LOGISTIC, l.temp_gpu, state.st);
        copy_gpu(l.outputs * l.batch, l.temp_gpu, 1, wi.delta_gpu, 1, state.st);
        s.input_gpu = l.prev_state_gpu;
        s.delta_gpu = l.dh_gpu;
        backward_connected_layer_gpu(wi, s);
        copy_gpu(l.outputs * l.batch, l.temp_gpu, 1, ui.delta_gpu, 1, state.st);
        s.input_gpu = state.input_gpu;
        s.delta_gpu = state.delta_gpu;
        backward_connected_layer_gpu(ui, s);
        /* forget gate gradient: temp = dc * c_prev * f' */
        copy_gpu(l.outputs * l.batch, l.temp2_gpu, 1, l.temp_gpu, 1, state.st);
        mul_gpu(l.outputs * l.batch, l.prev_cell_gpu, 1, l.temp_gpu, 1, state.st);
        gradient_array_gpu(l.f_gpu, l.outputs * l.batch, LOGISTIC, l.temp_gpu, state.st);
        copy_gpu(l.outputs * l.batch, l.temp_gpu, 1, wf.delta_gpu, 1, state.st);
        s.input_gpu = l.prev_state_gpu;
        s.delta_gpu = l.dh_gpu;
        backward_connected_layer_gpu(wf, s);
        copy_gpu(l.outputs * l.batch, l.temp_gpu, 1, uf.delta_gpu, 1, state.st);
        s.input_gpu = state.input_gpu;
        s.delta_gpu = state.delta_gpu;
        backward_connected_layer_gpu(uf, s);
        /* carry dL/dc_prev = dc * f to the next (earlier) iteration */
        copy_gpu(l.outputs * l.batch, l.temp2_gpu, 1, l.temp_gpu, 1, state.st);
        mul_gpu(l.outputs * l.batch, l.f_gpu, 1, l.temp_gpu, 1, state.st);
        copy_gpu(l.outputs * l.batch, l.temp_gpu, 1, l.dc_gpu, 1, state.st);
        /* rewind sequence buffers and sub-layer views one step */
        state.input_gpu -= l.inputs * l.batch;
        if (state.delta_gpu) state.delta_gpu -= l.inputs * l.batch;
        l.output_gpu -= l.outputs * l.batch;
        l.cell_gpu -= l.outputs * l.batch;
        l.delta_gpu -= l.outputs * l.batch;
        increment_layer(&wf, -1);
        increment_layer(&wi, -1);
        increment_layer(&wg, -1);
        increment_layer(&wo, -1);
        increment_layer(&uf, -1);
        increment_layer(&ui, -1);
        increment_layer(&ug, -1);
        increment_layer(&uo, -1);
    }
}
/*
 * Forward pass for an LSTM layer on the GPU.
 * Per step: run the four recurrent (w*) connected layers on h and the
 * four input (u*) connected layers on x, combine into the f/i/g/o gates,
 * then update c = f*c + i*g and h = o*tanh(c), recording c and h into
 * the per-step cell_gpu/output_gpu buffers.
 * NOTE(review): s.delta_gpu stays null (s zero-initialized), so the
 * sub-layer forwards see no delta buffer here.
 */
void forward_lstm_layer_gpu(layer l, network state)
{
    network s = { 0 };
    s.train = state.train;
    int i;
    /* local views of the eight internal connected layers */
    layer wf = *(l.wf);
    layer wi = *(l.wi);
    layer wg = *(l.wg);
    layer wo = *(l.wo);
    layer uf = *(l.uf);
    layer ui = *(l.ui);
    layer ug = *(l.ug);
    layer uo = *(l.uo);
    /* clear all sub-layer deltas for the whole sequence */
    fill_gpu(l.outputs * l.batch * l.steps, 0, wf.delta_gpu, 1, state.st);
    fill_gpu(l.outputs * l.batch * l.steps, 0, wi.delta_gpu, 1, state.st);
    fill_gpu(l.outputs * l.batch * l.steps, 0, wg.delta_gpu, 1, state.st);
    fill_gpu(l.outputs * l.batch * l.steps, 0, wo.delta_gpu, 1, state.st);
    fill_gpu(l.outputs * l.batch * l.steps, 0, uf.delta_gpu, 1, state.st);
    fill_gpu(l.outputs * l.batch * l.steps, 0, ui.delta_gpu, 1, state.st);
    fill_gpu(l.outputs * l.batch * l.steps, 0, ug.delta_gpu, 1, state.st);
    fill_gpu(l.outputs * l.batch * l.steps, 0, uo.delta_gpu, 1, state.st);
    if (state.train) {
        fill_gpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1, state.st);
    }
    for (i = 0; i < l.steps; ++i) {
        /* recurrent contributions from the previous hidden state h */
        s.input_gpu = l.h_gpu;
        forward_connected_layer_gpu(wf, s);
        forward_connected_layer_gpu(wi, s);
        forward_connected_layer_gpu(wg, s);
        forward_connected_layer_gpu(wo, s);
        /* input contributions from this step's x */
        s.input_gpu = state.input_gpu;
        forward_connected_layer_gpu(uf, s);
        forward_connected_layer_gpu(ui, s);
        forward_connected_layer_gpu(ug, s);
        forward_connected_layer_gpu(uo, s);
        /* pre-activation gates: gate = w*(h) + u*(x) */
        copy_gpu(l.outputs * l.batch, wf.output_gpu, 1, l.f_gpu, 1, state.st);
        axpy_gpu(l.outputs * l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1, state.st);
        copy_gpu(l.outputs * l.batch, wi.output_gpu, 1, l.i_gpu, 1, state.st);
        axpy_gpu(l.outputs * l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1, state.st);
        copy_gpu(l.outputs * l.batch, wg.output_gpu, 1, l.g_gpu, 1, state.st);
        axpy_gpu(l.outputs * l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1, state.st);
        copy_gpu(l.outputs * l.batch, wo.output_gpu, 1, l.o_gpu, 1, state.st);
        axpy_gpu(l.outputs * l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1, state.st);
        /* sigmoid for f/i/o, tanh for the candidate g */
        activate_array_gpu(l.f_gpu, l.outputs * l.batch, LOGISTIC, state.st);
        activate_array_gpu(l.i_gpu, l.outputs * l.batch, LOGISTIC, state.st);
        activate_array_gpu(l.g_gpu, l.outputs * l.batch, TANH, state.st);
        activate_array_gpu(l.o_gpu, l.outputs * l.batch, LOGISTIC, state.st);
        /* cell update: c = f*c + i*g */
        copy_gpu(l.outputs * l.batch, l.i_gpu, 1, l.temp_gpu, 1, state.st);
        mul_gpu(l.outputs * l.batch, l.g_gpu, 1, l.temp_gpu, 1, state.st);
        mul_gpu(l.outputs * l.batch, l.f_gpu, 1, l.c_gpu, 1, state.st);
        axpy_gpu(l.outputs * l.batch, 1, l.temp_gpu, 1, l.c_gpu, 1, state.st);
        /* hidden update: h = o * tanh(c) */
        copy_gpu(l.outputs * l.batch, l.c_gpu, 1, l.h_gpu, 1, state.st);
        activate_array_gpu(l.h_gpu, l.outputs * l.batch, TANH, state.st);
        mul_gpu(l.outputs * l.batch, l.o_gpu, 1, l.h_gpu, 1, state.st);
        /* record per-step cell and output for BPTT */
        copy_gpu(l.outputs * l.batch, l.c_gpu, 1, l.cell_gpu, 1, state.st);
        copy_gpu(l.outputs * l.batch, l.h_gpu, 1, l.output_gpu, 1, state.st);
        /* advance sequence buffers and sub-layer views one step */
        state.input_gpu += l.inputs * l.batch;
        l.output_gpu += l.outputs * l.batch;
        l.cell_gpu += l.outputs * l.batch;
        increment_layer(&wf, 1);
        increment_layer(&wi, 1);
        increment_layer(&wg, 1);
        increment_layer(&wo, 1);
        increment_layer(&uf, 1);
        increment_layer(&ui, 1);
        increment_layer(&ug, 1);
        increment_layer(&uo, 1);
    }
}
/*
 * Adversarial training loop for an image colorizer (GPU only).
 * net learns to colorize grayscale inputs; anet is the discriminator
 * judging real (train) vs generated (gray->colorized) images.
 *
 * Fixes vs. original:
 *  - gloss_avg EMA was seeded from its -1 sentinel (aloss_avg had the
 *    init guard, gloss_avg did not), skewing the reported generator
 *    average for many iterations; guard added.
 *  - pixs/graypixs host buffers were never freed; freed after the loop.
 */
void train_colorizer(char *cfg, char *weight, char *acfg, char *aweight, int clear, int display)
{
#ifdef GPU
    //char *train_images = "/home/kunle12/data/coco/train1.txt";
    //char *train_images = "/home/kunle12/data/coco/trainvalno5k.txt";
    char *train_images = "/home/kunle12/data/imagenet/imagenet1k.train.list";
    char *backup_directory = "/home/kunle12/backup/";
    srand(time(0));
    char *base = basecfg(cfg);
    char *abase = basecfg(acfg);
    printf("%s\n", base);
    network *net = load_network(cfg, weight, clear);
    network *anet = load_network(acfg, aweight, clear);
    int i, j, k;
    /* first 3-channel output layer is treated as the generated image */
    layer imlayer = {0};
    for (i = 0; i < net->n; ++i) {
        if (net->layers[i].out_c == 3) {
            imlayer = net->layers[i];
            break;
        }
    }
    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
    int imgs = net->batch*net->subdivisions;
    i = *net->seen/imgs;
    data train, buffer;
    list *plist = get_paths(train_images);
    //int N = plist->size;
    char **paths = (char **)list_to_array(plist);
    load_args args = get_base_args(net);
    args.paths = paths;
    args.n = imgs;
    args.m = plist->size;
    args.d = &buffer;
    args.type = CLASSIFICATION_DATA;
    args.classes = 1;
    char *ls[2] = {"imagenet"};
    args.labels = ls;
    pthread_t load_thread = load_data_in_thread(args);
    clock_t time;
    int x_size = net->inputs*net->batch;
    //int y_size = x_size;
    net->delta = 0;
    net->train = 1;
    float *pixs = calloc(x_size, sizeof(float));      /* color batch */
    float *graypixs = calloc(x_size, sizeof(float));  /* grayscale batch */
    //float *y = calloc(y_size, sizeof(float));
    //int ay_size = anet->outputs*anet->batch;
    anet->delta = 0;
    anet->train = 1;
    float *imerror = cuda_make_array(0, imlayer.outputs*imlayer.batch);
    /* -1 sentinels: EMAs are seeded from the first real loss value */
    float aloss_avg = -1;
    float gloss_avg = -1;
    //data generated = copy_data(train);
    while (get_current_batch(net) < net->max_batches) {
        i += 1;
        time=clock();
        pthread_join(load_thread, 0);
        train = buffer;
        load_thread = load_data_in_thread(args);
        printf("Loaded: %lf seconds\n", sec(clock()-time));
        /* gray copy is the generator input; label smoothing .95/.05 */
        data gray = copy_data(train);
        for(j = 0; j < imgs; ++j){
            image gim = float_to_image(net->w, net->h, net->c, gray.X.vals[j]);
            grayscale_image_3c(gim);
            train.y.vals[j][0] = .95;
            gray.y.vals[j][0] = .05;
        }
        time=clock();
        float gloss = 0;
        for(j = 0; j < net->subdivisions; ++j){
            get_next_batch(train, net->batch, j*net->batch, pixs, 0);
            get_next_batch(gray, net->batch, j*net->batch, graypixs, 0);
            cuda_push_array(net->input_gpu, graypixs, net->inputs*net->batch);
            cuda_push_array(net->truth_gpu, pixs, net->truths*net->batch);
            /*
               image origi = float_to_image(net->w, net->h, 3, pixs);
               image grayi = float_to_image(net->w, net->h, 3, graypixs);
               show_image(grayi, "gray");
               show_image(origi, "orig");
               cvWaitKey(0);
             */
            *net->seen += net->batch;
            forward_network_gpu(net);
            /* adversarial signal: run discriminator on the generated image
               and capture its input-gradient in imerror */
            fill_gpu(imlayer.outputs*imlayer.batch, 0, imerror, 1);
            copy_gpu(anet->inputs*anet->batch, imlayer.output_gpu, 1, anet->input_gpu, 1);
            fill_gpu(anet->inputs*anet->batch, .95, anet->truth_gpu, 1);
            anet->delta_gpu = imerror;
            forward_network_gpu(anet);
            backward_network_gpu(anet);
            /* down-weight the reconstruction gradient, then add the
               adversarial gradient before generator backward */
            scal_gpu(imlayer.outputs*imlayer.batch, 1./100., net->layers[net->n-1].delta_gpu, 1);
            scal_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1);
            printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs*imlayer.batch));
            printf("features %f\n", cuda_mag_array(net->layers[net->n-1].delta_gpu, imlayer.outputs*imlayer.batch));
            axpy_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, net->layers[net->n-1].delta_gpu, 1);
            backward_network_gpu(net);
            gloss += *net->cost /(net->subdivisions*net->batch);
            /* stash the generated images back into `gray` so the
               discriminator trains on them below */
            for(k = 0; k < net->batch; ++k){
                int index = j*net->batch + k;
                copy_cpu(imlayer.outputs, imlayer.output + k*imlayer.outputs, 1, gray.X.vals[index], 1);
            }
        }
        harmless_update_network_gpu(anet);
        data merge = concat_data(train, gray);
        //randomize_data(merge);
        float aloss = train_network(anet, merge);
        update_network_gpu(net);
#ifdef OPENCV
        if(display){
            image im = float_to_image(anet->w, anet->h, anet->c, gray.X.vals[0]);
            image im2 = float_to_image(anet->w, anet->h, anet->c, train.X.vals[0]);
            show_image(im, "gen", 1);
            show_image(im2, "train", 1);
        }
#endif
        free_data(merge);
        free_data(train);
        free_data(gray);
        if (aloss_avg < 0) aloss_avg = aloss;
        if (gloss_avg < 0) gloss_avg = gloss;  /* was missing: EMA started from -1 */
        aloss_avg = aloss_avg*.9 + aloss*.1;
        gloss_avg = gloss_avg*.9 + gloss*.1;
        printf("%d: gen: %f, adv: %f | gen_avg: %f, adv_avg: %f, %f rate, %lf seconds, %d images\n", i, gloss, aloss, gloss_avg, aloss_avg, get_current_rate(net), sec(clock()-time), i*imgs);
        if(i%1000==0){
            char buff[256];
            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
            save_weights(net, buff);
            sprintf(buff, "%s/%s_%d.weights", backup_directory, abase, i);
            save_weights(anet, buff);
        }
        if(i%100==0){
            char buff[256];
            sprintf(buff, "%s/%s.backup", backup_directory, base);
            save_weights(net, buff);
            sprintf(buff, "%s/%s.backup", backup_directory, abase);
            save_weights(anet, buff);
        }
    }
    char buff[256];
    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
    save_weights(net, buff);
    /* host scratch buffers were previously leaked */
    free(pixs);
    free(graypixs);
#endif
}
/*
 * DCGAN training loop (GPU only). gnet is the generator (fed normalized
 * Gaussian noise), anet the discriminator trained on real vs generated
 * images with labels 1/0.
 *
 * Fix vs. original: free_network(gnet)/free_network(anet) were placed
 * AFTER the closing #endif, but both variables are declared inside
 * #ifdef GPU — the function did not compile in a non-GPU build. The
 * frees now live inside the guard.
 */
void train_dcgan(char *cfg, char *weight, char *acfg, char *aweight, int clear, int display, char *train_images, int maxbatch)
{
#ifdef GPU
    char *backup_directory = "/home/kunle12/backup/";
    srand(time(0));
    char *base = basecfg(cfg);
    char *abase = basecfg(acfg);
    printf("%s\n", base);
    network *gnet = load_network(cfg, weight, clear);
    network *anet = load_network(acfg, aweight, clear);
    //float orig_rate = anet->learning_rate;
    int i, j, k;
    /* first 3-channel output layer is treated as the generated image */
    layer imlayer = {0};
    for (i = 0; i < gnet->n; ++i) {
        if (gnet->layers[i].out_c == 3) {
            imlayer = gnet->layers[i];
            break;
        }
    }
    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", gnet->learning_rate, gnet->momentum, gnet->decay);
    int imgs = gnet->batch*gnet->subdivisions;
    i = *gnet->seen/imgs;
    data train, buffer;
    list *plist = get_paths(train_images);
    //int N = plist->size;
    char **paths = (char **)list_to_array(plist);
    load_args args= get_base_args(anet);
    args.paths = paths;
    args.n = imgs;
    args.m = plist->size;
    args.d = &buffer;
    args.type = CLASSIFICATION_DATA;
    args.threads=16;
    args.classes = 1;
    char *ls[2] = {"imagenet", "zzzzzzzz"};
    args.labels = ls;
    pthread_t load_thread = load_data_in_thread(args);
    clock_t time;
    gnet->train = 1;
    anet->train = 1;
    int x_size = gnet->inputs*gnet->batch;
    int y_size = gnet->truths*gnet->batch;
    float *imerror = cuda_make_array(0, y_size);
    //int ay_size = anet->truths*anet->batch;
    float aloss_avg = -1;
    //data generated = copy_data(train);
    if (maxbatch == 0) maxbatch = gnet->max_batches;
    while (get_current_batch(gnet) < maxbatch) {
        i += 1;
        time=clock();
        pthread_join(load_thread, 0);
        train = buffer;
        //translate_data_rows(train, -.5);
        //scale_data_rows(train, 2);
        load_thread = load_data_in_thread(args);
        printf("Loaded: %lf seconds\n", sec(clock()-time));
        /* real images labeled 1, generated labeled 0 */
        data gen = copy_data(train);
        for (j = 0; j < imgs; ++j) {
            train.y.vals[j][0] = 1;
            gen.y.vals[j][0] = 0;
        }
        time=clock();
        for(j = 0; j < gnet->subdivisions; ++j){
            get_next_batch(train, gnet->batch, j*gnet->batch, gnet->truth, 0);
            /* per-sample unit-norm Gaussian noise as generator input */
            int z;
            for(z = 0; z < x_size; ++z){
                gnet->input[z] = rand_normal();
            }
            for(z = 0; z < gnet->batch; ++z){
                float mag = mag_array(gnet->input + z*gnet->inputs, gnet->inputs);
                scale_array(gnet->input + z*gnet->inputs, gnet->inputs, 1./mag);
            }
            /*
               for(z = 0; z < 100; ++z){
               printf("%f, ", gnet->input[z]);
               }
               printf("\n");
               printf("input: %f %f\n", mean_array(gnet->input, x_size), variance_array(gnet->input, x_size));
             */
            //cuda_push_array(gnet->input_gpu, gnet->input, x_size);
            //cuda_push_array(gnet->truth_gpu, gnet->truth, y_size);
            *gnet->seen += gnet->batch;
            forward_network(gnet);
            /* capture discriminator's input-gradient in imerror */
            fill_gpu(imlayer.outputs*imlayer.batch, 0, imerror, 1);
            fill_cpu(anet->truths*anet->batch, 1, anet->truth, 1);
            copy_cpu(anet->inputs*anet->batch, imlayer.output, 1, anet->input, 1);
            anet->delta_gpu = imerror;
            forward_network(anet);
            backward_network(anet);
            //float genaloss = *anet->cost / anet->batch;
            //printf("%f\n", genaloss);
            /* generator gets only the adversarial gradient */
            scal_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1);
            scal_gpu(imlayer.outputs*imlayer.batch, 0, gnet->layers[gnet->n-1].delta_gpu, 1);
            //printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs*imlayer.batch));
            //printf("features %f\n", cuda_mag_array(gnet->layers[gnet->n-1].delta_gpu, imlayer.outputs*imlayer.batch));
            axpy_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, gnet->layers[gnet->n-1].delta_gpu, 1);
            backward_network(gnet);
            /*
               for(k = 0; k < gnet->n; ++k){
               layer l = gnet->layers[k];
               cuda_pull_array(l.output_gpu, l.output, l.outputs*l.batch);
               printf("%d: %f %f\n", k, mean_array(l.output, l.outputs*l.batch), variance_array(l.output, l.outputs*l.batch));
               }
             */
            /* stash generated images for discriminator training */
            for(k = 0; k < gnet->batch; ++k){
                int index = j*gnet->batch + k;
                copy_cpu(gnet->outputs, gnet->output + k*gnet->outputs, 1, gen.X.vals[index], 1);
            }
        }
        harmless_update_network_gpu(anet);
        data merge = concat_data(train, gen);
        //randomize_data(merge);
        float aloss = train_network(anet, merge);
        //translate_image(im, 1);
        //scale_image(im, .5);
        //translate_image(im2, 1);
        //scale_image(im2, .5);
#ifdef OPENCV
        if(display){
            image im = float_to_image(anet->w, anet->h, anet->c, gen.X.vals[0]);
            image im2 = float_to_image(anet->w, anet->h, anet->c, train.X.vals[0]);
            show_image(im, "gen", 1);
            show_image(im2, "train", 1);
            save_image(im, "gen");
            save_image(im2, "train");
        }
#endif
        /*
           if(aloss < .1){
           anet->learning_rate = 0;
           } else if (aloss > .3){
           anet->learning_rate = orig_rate;
           }
         */
        update_network_gpu(gnet);
        free_data(merge);
        free_data(train);
        free_data(gen);
        if (aloss_avg < 0) aloss_avg = aloss;
        aloss_avg = aloss_avg*.9 + aloss*.1;
        printf("%d: adv: %f | adv_avg: %f, %f rate, %lf seconds, %d images\n", i, aloss, aloss_avg, get_current_rate(gnet), sec(clock()-time), i*imgs);
        if(i%10000==0){
            char buff[256];
            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
            save_weights(gnet, buff);
            sprintf(buff, "%s/%s_%d.weights", backup_directory, abase, i);
            save_weights(anet, buff);
        }
        if(i%1000==0){
            char buff[256];
            sprintf(buff, "%s/%s.backup", backup_directory, base);
            save_weights(gnet, buff);
            sprintf(buff, "%s/%s.backup", backup_directory, abase);
            save_weights(anet, buff);
        }
    }
    char buff[256];
    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
    save_weights(gnet, buff);
    /* must stay inside #ifdef GPU: gnet/anet only exist in GPU builds */
    free_network(gnet);
    free_network(anet);
#endif
}
/*
 * Progressive-growing GAN training loop (GPU only). Like train_dcgan,
 * but each iteration ramps alpha from 0 to 1 over the first half of
 * training and pushes alpha/beta blend factors into both networks.
 *
 * Fix vs. original: free_network(gnet)/free_network(anet) were placed
 * AFTER the closing #endif while the variables are declared inside
 * #ifdef GPU — a compile error in non-GPU builds. Frees moved inside
 * the guard.
 */
void train_prog(char *cfg, char *weight, char *acfg, char *aweight, int clear, int display, char *train_images, int maxbatch)
{
#ifdef GPU
    char *backup_directory = "/home/kunle12/backup/";
    srand(time(0));
    char *base = basecfg(cfg);
    char *abase = basecfg(acfg);
    printf("%s\n", base);
    network *gnet = load_network(cfg, weight, clear);
    network *anet = load_network(acfg, aweight, clear);
    int i, j, k;
    /* generator's last layer is the generated image */
    layer imlayer = gnet->layers[gnet->n-1];
    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", gnet->learning_rate, gnet->momentum, gnet->decay);
    int imgs = gnet->batch*gnet->subdivisions;
    i = *gnet->seen/imgs;
    data train, buffer;
    list *plist = get_paths(train_images);
    char **paths = (char **)list_to_array(plist);
    load_args args= get_base_args(anet);
    args.paths = paths;
    args.n = imgs;
    args.m = plist->size;
    args.d = &buffer;
    args.type = CLASSIFICATION_DATA;
    args.threads=16;
    args.classes = 1;
    char *ls[2] = {"imagenet", "zzzzzzzz"};
    args.labels = ls;
    pthread_t load_thread = load_data_in_thread(args);
    clock_t time;
    gnet->train = 1;
    anet->train = 1;
    int x_size = gnet->inputs*gnet->batch;
    int y_size = gnet->truths*gnet->batch;
    float *imerror = cuda_make_array(0, y_size);
    float aloss_avg = -1;
    if (maxbatch == 0) maxbatch = gnet->max_batches;
    while (get_current_batch(gnet) < maxbatch) {
        {
            /* progressive blend: alpha ramps 0->1 over the first half of
               training; generator and discriminator get mirrored factors */
            int cb = get_current_batch(gnet);
            float alpha = (float) cb / (maxbatch/2);
            if(alpha > 1) alpha = 1;
            float beta = 1 - alpha;
            printf("%f %f\n", alpha, beta);
            set_network_alpha_beta(gnet, alpha, beta);
            set_network_alpha_beta(anet, beta, alpha);
        }
        i += 1;
        time=clock();
        pthread_join(load_thread, 0);
        train = buffer;
        load_thread = load_data_in_thread(args);
        printf("Loaded: %lf seconds\n", sec(clock()-time));
        /* real images labeled 1, generated labeled 0 */
        data gen = copy_data(train);
        for (j = 0; j < imgs; ++j) {
            train.y.vals[j][0] = 1;
            gen.y.vals[j][0] = 0;
        }
        time=clock();
        for (j = 0; j < gnet->subdivisions; ++j) {
            get_next_batch(train, gnet->batch, j*gnet->batch, gnet->truth, 0);
            /* Gaussian noise as generator input */
            int z;
            for(z = 0; z < x_size; ++z){
                gnet->input[z] = rand_normal();
            }
            /*
               for(z = 0; z < gnet->batch; ++z){
               float mag = mag_array(gnet->input + z*gnet->inputs, gnet->inputs);
               scale_array(gnet->input + z*gnet->inputs, gnet->inputs, 1./mag);
               }
             */
            *gnet->seen += gnet->batch;
            forward_network(gnet);
            /* capture discriminator's input-gradient in imerror */
            fill_gpu(imlayer.outputs*imlayer.batch, 0, imerror, 1);
            fill_cpu(anet->truths*anet->batch, 1, anet->truth, 1);
            copy_cpu(anet->inputs*anet->batch, imlayer.output, 1, anet->input, 1);
            anet->delta_gpu = imerror;
            forward_network(anet);
            backward_network(anet);
            //float genaloss = *anet->cost / anet->batch;
            /* generator gets only the adversarial gradient */
            scal_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1);
            scal_gpu(imlayer.outputs*imlayer.batch, 0, gnet->layers[gnet->n-1].delta_gpu, 1);
            axpy_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, gnet->layers[gnet->n-1].delta_gpu, 1);
            backward_network(gnet);
            /* stash generated images for discriminator training */
            for(k = 0; k < gnet->batch; ++k){
                int index = j*gnet->batch + k;
                copy_cpu(gnet->outputs, gnet->output + k*gnet->outputs, 1, gen.X.vals[index], 1);
            }
        }
        harmless_update_network_gpu(anet);
        data merge = concat_data(train, gen);
        float aloss = train_network(anet, merge);
#ifdef OPENCV
        if(display){
            image im = float_to_image(anet->w, anet->h, anet->c, gen.X.vals[0]);
            image im2 = float_to_image(anet->w, anet->h, anet->c, train.X.vals[0]);
            show_image(im, "gen", 1);
            show_image(im2, "train", 1);
            save_image(im, "gen");
            save_image(im2, "train");
        }
#endif
        update_network_gpu(gnet);
        free_data(merge);
        free_data(train);
        free_data(gen);
        if (aloss_avg < 0) aloss_avg = aloss;
        aloss_avg = aloss_avg*.9 + aloss*.1;
        printf("%d: adv: %f | adv_avg: %f, %f rate, %lf seconds, %d images\n", i, aloss, aloss_avg, get_current_rate(gnet), sec(clock()-time), i*imgs);
        if(i%10000==0){
            char buff[256];
            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
            save_weights(gnet, buff);
            sprintf(buff, "%s/%s_%d.weights", backup_directory, abase, i);
            save_weights(anet, buff);
        }
        if(i%1000==0){
            char buff[256];
            sprintf(buff, "%s/%s.backup", backup_directory, base);
            save_weights(gnet, buff);
            sprintf(buff, "%s/%s.backup", backup_directory, abase);
            save_weights(anet, buff);
        }
    }
    char buff[256];
    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
    save_weights(gnet, buff);
    /* must stay inside #ifdef GPU: gnet/anet only exist in GPU builds */
    free_network( gnet );
    free_network( anet );
#endif
}
/*
 * Backward pass for a detection layer on the GPU: accumulate the layer's
 * delta into the upstream network delta buffer (no activation to undo).
 */
void backward_detection_layer_gpu(detection_layer l, network net)
{
    const int count = l.batch*l.inputs;
    /* net.delta += 1 * l.delta */
    axpy_gpu(count, 1, l.delta_gpu, 1, net.delta_gpu, 1);
    //copy_gpu(l.batch*l.inputs, l.delta_gpu, 1, net.delta_gpu, 1);
}
/*
 * Backward pass for an L2-normalization layer on the GPU.
 * Adds the stored scale gradient into the layer delta, then accumulates
 * the result into the upstream network delta buffer.
 */
void backward_l2norm_layer_gpu(const layer l, network * net)
{
    const int count = l.batch*l.inputs;
    /* l.delta += scales; then net->delta += l.delta */
    axpy_gpu(count, 1, l.scales_gpu, 1, l.delta_gpu, 1);
    axpy_gpu(count, 1, l.delta_gpu, 1, net->delta_gpu, 1);
}
/*
 * Backward pass for a cost layer on the GPU: scale the cost delta by
 * l.scale and accumulate it into the upstream network delta buffer.
 */
void backward_cost_layer_gpu(const cost_layer l, network * net)
{
    const int count = l.batch*l.inputs;
    /* net->delta += scale * l.delta */
    axpy_gpu(count, l.scale, l.delta_gpu, 1, net->delta_gpu, 1);
}