void forward_crnn_layer_gpu(layer l, network net)
{
    network s = net;
    int i;
    layer input_layer = *(l.input_layer);
    layer self_layer = *(l.self_layer);
    layer output_layer = *(l.output_layer);

    fill_gpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1);
    fill_gpu(l.hidden * l.batch * l.steps, 0, self_layer.delta_gpu, 1);
    fill_gpu(l.hidden * l.batch * l.steps, 0, input_layer.delta_gpu, 1);
    if(net.train) fill_gpu(l.hidden * l.batch, 0, l.state_gpu, 1);

    for (i = 0; i < l.steps; ++i) {
        s.input_gpu = net.input_gpu;
        forward_convolutional_layer_gpu(input_layer, s);

        s.input_gpu = l.state_gpu;
        forward_convolutional_layer_gpu(self_layer, s);

        float *old_state = l.state_gpu;
        if(net.train) l.state_gpu += l.hidden*l.batch;
        if(l.shortcut){
            copy_gpu(l.hidden * l.batch, old_state, 1, l.state_gpu, 1);
        }else{
            fill_gpu(l.hidden * l.batch, 0, l.state_gpu, 1);
        }
        axpy_gpu(l.hidden * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1);
        axpy_gpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);

        s.input_gpu = l.state_gpu;
        forward_convolutional_layer_gpu(output_layer, s);

        net.input_gpu += l.inputs*l.batch;
        increment_layer(&input_layer, 1);
        increment_layer(&self_layer, 1);
        increment_layer(&output_layer, 1);
    }
}
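/* forward_crnn_layer_gpu steps its sub-layers through time with
   increment_layer. For reference, a sketch of that helper as it appears in
   darknet's recurrent layers (the exact field set may differ by version):
   it advances every per-timestep buffer by `steps` timesteps. */
static void increment_layer(layer *l, int steps)
{
    int num = l->outputs*l->batch*steps;
    l->output += num;
    l->delta += num;
    l->x += num;
    l->x_norm += num;

#ifdef GPU
    l->output_gpu += num;
    l->delta_gpu += num;
    l->x_gpu += num;
    l->x_norm_gpu += num;
#endif
}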
float *cuda_make_array(float *x, size_t n)
{
    float *x_gpu;
    size_t size = sizeof(float)*n;
    cudaError_t status = cudaMalloc((void **)&x_gpu, size);
    check_error(status);
    if(!x_gpu) error("Cuda malloc failed\n");  /* fail fast, before the buffer is used */
    if(x){
        status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice);
        check_error(status);
    } else {
        fill_gpu(n, 0, x_gpu, 1);
    }
    return x_gpu;
}
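/* Usage sketch for cuda_make_array (hypothetical demo function; assumes
   darknet's cuda_pull_array/cuda_free from cuda.h and <stdlib.h>). Passing a
   null host pointer yields a zero-filled device buffer; passing a host array
   yields a device copy of it. */
void cuda_make_array_demo(void)
{
    size_t n = 1024;
    float *host = calloc(n, sizeof(float));
    float *d_zeros = cuda_make_array(0, n);     /* zero-filled device buffer */
    float *d_data  = cuda_make_array(host, n);  /* device copy of host data */
    cuda_pull_array(d_data, host, n);           /* copy device -> host */
    cuda_free(d_zeros);
    cuda_free(d_data);
    free(host);
}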
void forward_lstm_layer_gpu(layer l, network state)
{
    network s = { 0 };
    s.train = state.train;
    int i;
    layer wf = *(l.wf);
    layer wi = *(l.wi);
    layer wg = *(l.wg);
    layer wo = *(l.wo);

    layer uf = *(l.uf);
    layer ui = *(l.ui);
    layer ug = *(l.ug);
    layer uo = *(l.uo);

    fill_gpu(l.outputs * l.batch * l.steps, 0, wf.delta_gpu, 1);
    fill_gpu(l.outputs * l.batch * l.steps, 0, wi.delta_gpu, 1);
    fill_gpu(l.outputs * l.batch * l.steps, 0, wg.delta_gpu, 1);
    fill_gpu(l.outputs * l.batch * l.steps, 0, wo.delta_gpu, 1);

    fill_gpu(l.outputs * l.batch * l.steps, 0, uf.delta_gpu, 1);
    fill_gpu(l.outputs * l.batch * l.steps, 0, ui.delta_gpu, 1);
    fill_gpu(l.outputs * l.batch * l.steps, 0, ug.delta_gpu, 1);
    fill_gpu(l.outputs * l.batch * l.steps, 0, uo.delta_gpu, 1);
    if (state.train) {
        fill_gpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1);
    }

    for (i = 0; i < l.steps; ++i) {
        // gate pre-activations from the recurrent branch (inputs h_{t-1})
        s.input_gpu = l.h_gpu;
        forward_connected_layer_gpu(wf, s);
        forward_connected_layer_gpu(wi, s);
        forward_connected_layer_gpu(wg, s);
        forward_connected_layer_gpu(wo, s);

        // gate pre-activations from the input branch (inputs x_t)
        s.input_gpu = state.input_gpu;
        forward_connected_layer_gpu(uf, s);
        forward_connected_layer_gpu(ui, s);
        forward_connected_layer_gpu(ug, s);
        forward_connected_layer_gpu(uo, s);

        // sum the two branches for each gate
        copy_gpu(l.outputs * l.batch, wf.output_gpu, 1, l.f_gpu, 1);
        axpy_gpu(l.outputs * l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1);

        copy_gpu(l.outputs * l.batch, wi.output_gpu, 1, l.i_gpu, 1);
        axpy_gpu(l.outputs * l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1);

        copy_gpu(l.outputs * l.batch, wg.output_gpu, 1, l.g_gpu, 1);
        axpy_gpu(l.outputs * l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1);

        copy_gpu(l.outputs * l.batch, wo.output_gpu, 1, l.o_gpu, 1);
        axpy_gpu(l.outputs * l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1);

        activate_array_gpu(l.f_gpu, l.outputs * l.batch, LOGISTIC);
        activate_array_gpu(l.i_gpu, l.outputs * l.batch, LOGISTIC);
        activate_array_gpu(l.g_gpu, l.outputs * l.batch, TANH);
        activate_array_gpu(l.o_gpu, l.outputs * l.batch, LOGISTIC);

        // c_t = f_t * c_{t-1} + i_t * g_t
        copy_gpu(l.outputs * l.batch, l.i_gpu, 1, l.temp_gpu, 1);
        mul_gpu(l.outputs * l.batch, l.g_gpu, 1, l.temp_gpu, 1);
        mul_gpu(l.outputs * l.batch, l.f_gpu, 1, l.c_gpu, 1);
        axpy_gpu(l.outputs * l.batch, 1, l.temp_gpu, 1, l.c_gpu, 1);

        // h_t = o_t * tanh(c_t)
        copy_gpu(l.outputs * l.batch, l.c_gpu, 1, l.h_gpu, 1);
        activate_array_gpu(l.h_gpu, l.outputs * l.batch, TANH);
        mul_gpu(l.outputs * l.batch, l.o_gpu, 1, l.h_gpu, 1);

        copy_gpu(l.outputs * l.batch, l.c_gpu, 1, l.cell_gpu, 1);
        copy_gpu(l.outputs * l.batch, l.h_gpu, 1, l.output_gpu, 1);

        state.input_gpu += l.inputs * l.batch;
        l.output_gpu += l.outputs * l.batch;
        l.cell_gpu += l.outputs * l.batch;

        increment_layer(&wf, 1);
        increment_layer(&wi, 1);
        increment_layer(&wg, 1);
        increment_layer(&wo, 1);

        increment_layer(&uf, 1);
        increment_layer(&ui, 1);
        increment_layer(&ug, 1);
        increment_layer(&uo, 1);
    }
}
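/* The arithmetic inside the time loop above, written out for a single
   element (a reference sketch, not part of darknet). Given the summed gate
   pre-activations f, i, g, o, the loop computes
       c_t = sigmoid(f) * c_{t-1} + sigmoid(i) * tanh(g)
       h_t = sigmoid(o) * tanh(c_t)
   Requires <math.h>. */
static float sigmoidf(float x) { return 1.f/(1.f + expf(-x)); }

static void lstm_cell_step(float f, float i, float g, float o, float *c, float *h)
{
    *c = sigmoidf(f)*(*c) + sigmoidf(i)*tanhf(g);  /* cell state update */
    *h = sigmoidf(o)*tanhf(*c);                    /* hidden state / output */
}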
int main(int argc, char** argv)
{
    // set up parameters
    // first argument is the y dimension = 2^arg
    size_t pow = read_arg(argc, argv, 1, 8);
    // second argument is the number of time steps
    size_t nsteps = read_arg(argc, argv, 2, 100);
    // third argument is nonzero if shared memory version is to be used
    bool use_shared = read_arg(argc, argv, 3, 0);

    // set domain size
    size_t nx = 128+2;
    size_t ny = (1 << pow)+2;
    double dt = 0.1;

    std::cout << "\n## " << nx << "x" << ny
              << " for " << nsteps << " time steps"
              << " (" << nx*ny << " grid points)\n";

    // allocate memory on device and host
    // note : allocate enough memory for the halo around the boundary
    auto buffer_size = nx*ny;
#ifdef OPENACC_DATA
    double *x0 = malloc_host_pinned<double>(buffer_size);
    double *x1 = malloc_host_pinned<double>(buffer_size);
#else
    double *x_host = malloc_host_pinned<double>(buffer_size);
    double *x0 = malloc_device<double>(buffer_size);
    double *x1 = malloc_device<double>(buffer_size);
#endif

    double start_diffusion, time_diffusion;

#ifdef OPENACC_DATA
    #pragma acc data create(x0[0:buffer_size]) copyout(x1[0:buffer_size])
#endif
    {
        // set initial conditions of 0 everywhere
        fill_gpu(x0, 0., buffer_size);
        fill_gpu(x1, 0., buffer_size);

        // set boundary conditions of 1 on south border
        fill_gpu(x0, 1., nx);
        fill_gpu(x1, 1., nx);
        fill_gpu(x0+nx*(ny-1), 1., nx);
        fill_gpu(x1+nx*(ny-1), 1., nx);

        // time stepping loop
        #pragma acc wait
        start_diffusion = get_time();
        for(auto step=0; step<nsteps; ++step) {
            diffusion_gpu(x0, x1, nx-2, ny-2, dt);
#ifdef OPENACC_DATA
            copy_gpu(x0, x1, buffer_size);
#else
            std::swap(x0, x1);
#endif
        }
        #pragma acc wait
        time_diffusion = get_time() - start_diffusion;
    } // end of acc data

#ifdef OPENACC_DATA
    auto x_res = x1;
#else
    copy_to_host<double>(x0, x_host, buffer_size);
    auto x_res = x_host;
#endif

    std::cout << "## " << time_diffusion << "s, "
              << nsteps*(nx-2)*(ny-2) / time_diffusion << " points/second\n\n";

    std::cout << "writing to output.bin/bov\n";
    write_to_file(nx, ny, x_res);

    return 0;
}
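// A minimal sketch of what the fill_gpu used above might look like in the
// CUDA (non-OPENACC_DATA) build. This is an assumption for illustration:
// the real helper lives elsewhere in the course sources, and kernel name and
// launch parameters here are hypothetical. Note this fill_gpu(double*,
// double, size_t) is unrelated to darknet's fill_gpu earlier in this file.
__global__ void fill_kernel(double* x, double value, size_t n)
{
    size_t i = threadIdx.x + blockIdx.x * size_t(blockDim.x);
    if (i < n) x[i] = value;
}

void fill_gpu(double* x, double value, size_t n)
{
    unsigned block_dim = 128;
    unsigned grid_dim = unsigned((n + block_dim - 1) / block_dim);
    fill_kernel<<<grid_dim, block_dim>>>(x, value, n);
}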
void train_colorizer(char *cfg, char *weight, char *acfg, char *aweight, int clear, int display)
{
#ifdef GPU
    //char *train_images = "/home/kunle12/data/coco/train1.txt";
    //char *train_images = "/home/kunle12/data/coco/trainvalno5k.txt";
    char *train_images = "/home/kunle12/data/imagenet/imagenet1k.train.list";
    char *backup_directory = "/home/kunle12/backup/";
    srand(time(0));
    char *base = basecfg(cfg);
    char *abase = basecfg(acfg);
    printf("%s\n", base);
    network *net = load_network(cfg, weight, clear);
    network *anet = load_network(acfg, aweight, clear);

    int i, j, k;
    layer imlayer = {0};
    for (i = 0; i < net->n; ++i) {
        if (net->layers[i].out_c == 3) {
            imlayer = net->layers[i];
            break;
        }
    }

    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
    int imgs = net->batch*net->subdivisions;
    i = *net->seen/imgs;
    data train, buffer;

    list *plist = get_paths(train_images);
    //int N = plist->size;
    char **paths = (char **)list_to_array(plist);

    load_args args = get_base_args(net);
    args.paths = paths;
    args.n = imgs;
    args.m = plist->size;
    args.d = &buffer;
    args.type = CLASSIFICATION_DATA;
    args.classes = 1;
    char *ls[2] = {"imagenet"};
    args.labels = ls;

    pthread_t load_thread = load_data_in_thread(args);
    clock_t time;

    int x_size = net->inputs*net->batch;
    //int y_size = x_size;
    net->delta = 0;
    net->train = 1;
    float *pixs = calloc(x_size, sizeof(float));
    float *graypixs = calloc(x_size, sizeof(float));
    //float *y = calloc(y_size, sizeof(float));

    //int ay_size = anet->outputs*anet->batch;
    anet->delta = 0;
    anet->train = 1;

    float *imerror = cuda_make_array(0, imlayer.outputs*imlayer.batch);

    float aloss_avg = -1;
    float gloss_avg = -1;

    //data generated = copy_data(train);

    while (get_current_batch(net) < net->max_batches) {
        i += 1;
        time=clock();
        pthread_join(load_thread, 0);
        train = buffer;
        load_thread = load_data_in_thread(args);

        printf("Loaded: %lf seconds\n", sec(clock()-time));

        data gray = copy_data(train);
        for(j = 0; j < imgs; ++j){
            image gim = float_to_image(net->w, net->h, net->c, gray.X.vals[j]);
            grayscale_image_3c(gim);
            train.y.vals[j][0] = .95;
            gray.y.vals[j][0] = .05;
        }

        time=clock();
        float gloss = 0;

        for(j = 0; j < net->subdivisions; ++j){
            get_next_batch(train, net->batch, j*net->batch, pixs, 0);
            get_next_batch(gray, net->batch, j*net->batch, graypixs, 0);
            cuda_push_array(net->input_gpu, graypixs, net->inputs*net->batch);
            cuda_push_array(net->truth_gpu, pixs, net->truths*net->batch);
            /*
               image origi = float_to_image(net->w, net->h, 3, pixs);
               image grayi = float_to_image(net->w, net->h, 3, graypixs);
               show_image(grayi, "gray");
               show_image(origi, "orig");
               cvWaitKey(0);
             */
            *net->seen += net->batch;

            forward_network_gpu(net);

            fill_gpu(imlayer.outputs*imlayer.batch, 0, imerror, 1);
            copy_gpu(anet->inputs*anet->batch, imlayer.output_gpu, 1, anet->input_gpu, 1);
            fill_gpu(anet->inputs*anet->batch, .95, anet->truth_gpu, 1);
            anet->delta_gpu = imerror;
            forward_network_gpu(anet);
            backward_network_gpu(anet);

            scal_gpu(imlayer.outputs*imlayer.batch, 1./100., net->layers[net->n-1].delta_gpu, 1);

            scal_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1);

            printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs*imlayer.batch));
            printf("features %f\n", cuda_mag_array(net->layers[net->n-1].delta_gpu, imlayer.outputs*imlayer.batch));

            axpy_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, net->layers[net->n-1].delta_gpu, 1);

            backward_network_gpu(net);

            gloss += *net->cost /(net->subdivisions*net->batch);

            for(k = 0; k < net->batch; ++k){
                int index = j*net->batch + k;
                copy_cpu(imlayer.outputs, imlayer.output + k*imlayer.outputs, 1, gray.X.vals[index], 1);
            }
        }
        harmless_update_network_gpu(anet);

        data merge = concat_data(train, gray);
        //randomize_data(merge);
        float aloss = train_network(anet, merge);

        update_network_gpu(net);

#ifdef OPENCV
        if(display){
            image im = float_to_image(anet->w, anet->h, anet->c, gray.X.vals[0]);
            image im2 = float_to_image(anet->w, anet->h, anet->c, train.X.vals[0]);
            show_image(im, "gen", 1);
            show_image(im2, "train", 1);
        }
#endif
        free_data(merge);
        free_data(train);
        free_data(gray);
        if (aloss_avg < 0) aloss_avg = aloss;
        aloss_avg = aloss_avg*.9 + aloss*.1;
        gloss_avg = gloss_avg*.9 + gloss*.1;

        printf("%d: gen: %f, adv: %f | gen_avg: %f, adv_avg: %f, %f rate, %lf seconds, %d images\n", i, gloss, aloss, gloss_avg, aloss_avg, get_current_rate(net), sec(clock()-time), i*imgs);
        if(i%1000==0){
            char buff[256];
            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
            save_weights(net, buff);
            sprintf(buff, "%s/%s_%d.weights", backup_directory, abase, i);
            save_weights(anet, buff);
        }
        if(i%100==0){
            char buff[256];
            sprintf(buff, "%s/%s.backup", backup_directory, base);
            save_weights(net, buff);
            sprintf(buff, "%s/%s.backup", backup_directory, abase);
            save_weights(anet, buff);
        }
    }
    char buff[256];
    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
    save_weights(net, buff);
#endif
}
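/* Gradient wiring in the inner loop of train_colorizer, summarized:
   - anet->delta_gpu is pointed at imerror before anet's forward/backward
     pass, so the discriminator's gradient w.r.t. its input image lands in
     imerror instead of anet's own input delta buffer;
   - the colorizer's reconstruction gradient is scaled down by 1/100;
   - imerror is then axpy'd into net's last-layer delta, so
     backward_network_gpu(net) propagates the blended adversarial plus
     reconstruction signal through the colorizer. */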
void train_dcgan(char *cfg, char *weight, char *acfg, char *aweight, int clear, int display, char *train_images, int maxbatch)
{
#ifdef GPU
    char *backup_directory = "/home/kunle12/backup/";
    srand(time(0));
    char *base = basecfg(cfg);
    char *abase = basecfg(acfg);
    printf("%s\n", base);
    network *gnet = load_network(cfg, weight, clear);
    network *anet = load_network(acfg, aweight, clear);
    //float orig_rate = anet->learning_rate;

    int i, j, k;
    layer imlayer = {0};
    for (i = 0; i < gnet->n; ++i) {
        if (gnet->layers[i].out_c == 3) {
            imlayer = gnet->layers[i];
            break;
        }
    }

    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", gnet->learning_rate, gnet->momentum, gnet->decay);
    int imgs = gnet->batch*gnet->subdivisions;
    i = *gnet->seen/imgs;
    data train, buffer;

    list *plist = get_paths(train_images);
    //int N = plist->size;
    char **paths = (char **)list_to_array(plist);

    load_args args = get_base_args(anet);
    args.paths = paths;
    args.n = imgs;
    args.m = plist->size;
    args.d = &buffer;
    args.type = CLASSIFICATION_DATA;
    args.threads = 16;
    args.classes = 1;
    char *ls[2] = {"imagenet", "zzzzzzzz"};
    args.labels = ls;

    pthread_t load_thread = load_data_in_thread(args);
    clock_t time;

    gnet->train = 1;
    anet->train = 1;

    int x_size = gnet->inputs*gnet->batch;
    int y_size = gnet->truths*gnet->batch;
    float *imerror = cuda_make_array(0, y_size);

    //int ay_size = anet->truths*anet->batch;

    float aloss_avg = -1;

    //data generated = copy_data(train);

    if (maxbatch == 0) maxbatch = gnet->max_batches;
    while (get_current_batch(gnet) < maxbatch) {
        i += 1;
        time=clock();
        pthread_join(load_thread, 0);
        train = buffer;

        //translate_data_rows(train, -.5);
        //scale_data_rows(train, 2);

        load_thread = load_data_in_thread(args);

        printf("Loaded: %lf seconds\n", sec(clock()-time));

        data gen = copy_data(train);
        for (j = 0; j < imgs; ++j) {
            train.y.vals[j][0] = 1;
            gen.y.vals[j][0] = 0;
        }
        time=clock();

        for(j = 0; j < gnet->subdivisions; ++j){
            get_next_batch(train, gnet->batch, j*gnet->batch, gnet->truth, 0);
            int z;
            for(z = 0; z < x_size; ++z){
                gnet->input[z] = rand_normal();
            }
            for(z = 0; z < gnet->batch; ++z){
                float mag = mag_array(gnet->input + z*gnet->inputs, gnet->inputs);
                scale_array(gnet->input + z*gnet->inputs, gnet->inputs, 1./mag);
            }
            /*
               for(z = 0; z < 100; ++z){
                   printf("%f, ", gnet->input[z]);
               }
               printf("\n");
               printf("input: %f %f\n", mean_array(gnet->input, x_size), variance_array(gnet->input, x_size));
             */

            //cuda_push_array(gnet->input_gpu, gnet->input, x_size);
            //cuda_push_array(gnet->truth_gpu, gnet->truth, y_size);
            *gnet->seen += gnet->batch;
            forward_network(gnet);

            fill_gpu(imlayer.outputs*imlayer.batch, 0, imerror, 1);
            fill_cpu(anet->truths*anet->batch, 1, anet->truth, 1);
            copy_cpu(anet->inputs*anet->batch, imlayer.output, 1, anet->input, 1);
            anet->delta_gpu = imerror;
            forward_network(anet);
            backward_network(anet);

            //float genaloss = *anet->cost / anet->batch;
            //printf("%f\n", genaloss);

            scal_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1);
            scal_gpu(imlayer.outputs*imlayer.batch, 0, gnet->layers[gnet->n-1].delta_gpu, 1);

            //printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs*imlayer.batch));
            //printf("features %f\n", cuda_mag_array(gnet->layers[gnet->n-1].delta_gpu, imlayer.outputs*imlayer.batch));

            axpy_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, gnet->layers[gnet->n-1].delta_gpu, 1);

            backward_network(gnet);

            /*
               for(k = 0; k < gnet->n; ++k){
                   layer l = gnet->layers[k];
                   cuda_pull_array(l.output_gpu, l.output, l.outputs*l.batch);
                   printf("%d: %f %f\n", k, mean_array(l.output, l.outputs*l.batch), variance_array(l.output, l.outputs*l.batch));
               }
             */

            for(k = 0; k < gnet->batch; ++k){
                int index = j*gnet->batch + k;
                copy_cpu(gnet->outputs, gnet->output + k*gnet->outputs, 1, gen.X.vals[index], 1);
            }
        }
        harmless_update_network_gpu(anet);

        data merge = concat_data(train, gen);
        //randomize_data(merge);
        float aloss = train_network(anet, merge);

        //translate_image(im, 1);
        //scale_image(im, .5);
        //translate_image(im2, 1);
        //scale_image(im2, .5);
#ifdef OPENCV
        if(display){
            image im = float_to_image(anet->w, anet->h, anet->c, gen.X.vals[0]);
            image im2 = float_to_image(anet->w, anet->h, anet->c, train.X.vals[0]);
            show_image(im, "gen", 1);
            show_image(im2, "train", 1);
            save_image(im, "gen");
            save_image(im2, "train");
        }
#endif

        /*
           if(aloss < .1){
               anet->learning_rate = 0;
           } else if (aloss > .3){
               anet->learning_rate = orig_rate;
           }
         */

        update_network_gpu(gnet);

        free_data(merge);
        free_data(train);
        free_data(gen);
        if (aloss_avg < 0) aloss_avg = aloss;
        aloss_avg = aloss_avg*.9 + aloss*.1;

        printf("%d: adv: %f | adv_avg: %f, %f rate, %lf seconds, %d images\n", i, aloss, aloss_avg, get_current_rate(gnet), sec(clock()-time), i*imgs);
        if(i%10000==0){
            char buff[256];
            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
            save_weights(gnet, buff);
            sprintf(buff, "%s/%s_%d.weights", backup_directory, abase, i);
            save_weights(anet, buff);
        }
        if(i%1000==0){
            char buff[256];
            sprintf(buff, "%s/%s.backup", backup_directory, base);
            save_weights(gnet, buff);
            sprintf(buff, "%s/%s.backup", backup_directory, abase);
            save_weights(anet, buff);
        }
    }
    char buff[256];
    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
    save_weights(gnet, buff);

    free_network(gnet);  /* keep inside the GPU guard: gnet/anet only exist here */
    free_network(anet);
#endif
}
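/* The latent initialization inside train_dcgan's inner loop, factored out
   for clarity (a hypothetical helper, not part of darknet): fill the latent
   vector with N(0,1) draws, then rescale it to unit L2 norm. rand_normal,
   mag_array and scale_array are the darknet utilities used above. */
static void sample_unit_gaussian(float *z, int n)
{
    int i;
    for(i = 0; i < n; ++i) z[i] = rand_normal();
    float mag = mag_array(z, n);       /* L2 norm of the sample */
    scale_array(z, n, 1.f/mag);        /* project onto the unit sphere */
}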
void train_prog(char *cfg, char *weight, char *acfg, char *aweight, int clear, int display, char *train_images, int maxbatch)
{
#ifdef GPU
    char *backup_directory = "/home/kunle12/backup/";
    srand(time(0));
    char *base = basecfg(cfg);
    char *abase = basecfg(acfg);
    printf("%s\n", base);
    network *gnet = load_network(cfg, weight, clear);
    network *anet = load_network(acfg, aweight, clear);

    int i, j, k;
    layer imlayer = gnet->layers[gnet->n-1];

    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", gnet->learning_rate, gnet->momentum, gnet->decay);
    int imgs = gnet->batch*gnet->subdivisions;
    i = *gnet->seen/imgs;
    data train, buffer;

    list *plist = get_paths(train_images);
    char **paths = (char **)list_to_array(plist);

    load_args args = get_base_args(anet);
    args.paths = paths;
    args.n = imgs;
    args.m = plist->size;
    args.d = &buffer;
    args.type = CLASSIFICATION_DATA;
    args.threads = 16;
    args.classes = 1;
    char *ls[2] = {"imagenet", "zzzzzzzz"};
    args.labels = ls;

    pthread_t load_thread = load_data_in_thread(args);
    clock_t time;

    gnet->train = 1;
    anet->train = 1;

    int x_size = gnet->inputs*gnet->batch;
    int y_size = gnet->truths*gnet->batch;
    float *imerror = cuda_make_array(0, y_size);

    float aloss_avg = -1;

    if (maxbatch == 0) maxbatch = gnet->max_batches;
    while (get_current_batch(gnet) < maxbatch) {
        {
            int cb = get_current_batch(gnet);
            float alpha = (float) cb / (maxbatch/2);
            if(alpha > 1) alpha = 1;
            float beta = 1 - alpha;
            printf("%f %f\n", alpha, beta);
            set_network_alpha_beta(gnet, alpha, beta);
            set_network_alpha_beta(anet, beta, alpha);
        }

        i += 1;
        time=clock();
        pthread_join(load_thread, 0);
        train = buffer;
        load_thread = load_data_in_thread(args);

        printf("Loaded: %lf seconds\n", sec(clock()-time));

        data gen = copy_data(train);
        for (j = 0; j < imgs; ++j) {
            train.y.vals[j][0] = 1;
            gen.y.vals[j][0] = 0;
        }
        time=clock();

        for (j = 0; j < gnet->subdivisions; ++j) {
            get_next_batch(train, gnet->batch, j*gnet->batch, gnet->truth, 0);
            int z;
            for(z = 0; z < x_size; ++z){
                gnet->input[z] = rand_normal();
            }
            /*
               for(z = 0; z < gnet->batch; ++z){
                   float mag = mag_array(gnet->input + z*gnet->inputs, gnet->inputs);
                   scale_array(gnet->input + z*gnet->inputs, gnet->inputs, 1./mag);
               }
             */

            *gnet->seen += gnet->batch;
            forward_network(gnet);

            fill_gpu(imlayer.outputs*imlayer.batch, 0, imerror, 1);
            fill_cpu(anet->truths*anet->batch, 1, anet->truth, 1);
            copy_cpu(anet->inputs*anet->batch, imlayer.output, 1, anet->input, 1);
            anet->delta_gpu = imerror;
            forward_network(anet);
            backward_network(anet);

            //float genaloss = *anet->cost / anet->batch;

            scal_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1);
            scal_gpu(imlayer.outputs*imlayer.batch, 0, gnet->layers[gnet->n-1].delta_gpu, 1);

            axpy_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, gnet->layers[gnet->n-1].delta_gpu, 1);

            backward_network(gnet);

            for(k = 0; k < gnet->batch; ++k){
                int index = j*gnet->batch + k;
                copy_cpu(gnet->outputs, gnet->output + k*gnet->outputs, 1, gen.X.vals[index], 1);
            }
        }
        harmless_update_network_gpu(anet);

        data merge = concat_data(train, gen);
        float aloss = train_network(anet, merge);

#ifdef OPENCV
        if(display){
            image im = float_to_image(anet->w, anet->h, anet->c, gen.X.vals[0]);
            image im2 = float_to_image(anet->w, anet->h, anet->c, train.X.vals[0]);
            show_image(im, "gen", 1);
            show_image(im2, "train", 1);
            save_image(im, "gen");
            save_image(im2, "train");
        }
#endif

        update_network_gpu(gnet);

        free_data(merge);
        free_data(train);
        free_data(gen);
        if (aloss_avg < 0) aloss_avg = aloss;
        aloss_avg = aloss_avg*.9 + aloss*.1;

        printf("%d: adv: %f | adv_avg: %f, %f rate, %lf seconds, %d images\n", i, aloss, aloss_avg, get_current_rate(gnet), sec(clock()-time), i*imgs);
        if(i%10000==0){
            char buff[256];
            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
            save_weights(gnet, buff);
            sprintf(buff, "%s/%s_%d.weights", backup_directory, abase, i);
            save_weights(anet, buff);
        }
        if(i%1000==0){
            char buff[256];
            sprintf(buff, "%s/%s.backup", backup_directory, base);
            save_weights(gnet, buff);
            sprintf(buff, "%s/%s.backup", backup_directory, abase);
            save_weights(anet, buff);
        }
    }
    char buff[256];
    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
    save_weights(gnet, buff);

    free_network(gnet);  /* keep inside the GPU guard: gnet/anet only exist here */
    free_network(anet);
#endif
}
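/* The fade-in schedule computed at the top of train_prog's loop, as a pure
   function (a hypothetical refactor for clarity, not part of darknet):
   alpha ramps linearly from 0 to 1 over the first half of training, and the
   networks get the complementary beta = 1 - alpha. */
static float prog_alpha(int current_batch, int maxbatch)
{
    float alpha = (float) current_batch / (maxbatch/2);
    return alpha > 1 ? 1 : alpha;   /* saturates at 1 halfway through */
}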
int main(int argc, char** argv)
{
    // set up parameters
    // first argument is the y dimension = 2^arg
    size_t pow = read_arg(argc, argv, 1, 8);
    // second argument is the number of time steps
    size_t nsteps = read_arg(argc, argv, 2, 100);
    // third argument is nonzero if shared memory version is to be used
    bool use_shared = read_arg(argc, argv, 3, 0);

    // set domain size
    size_t nx = 128;
    size_t ny = 1 << pow;
    double dt = 0.1;

    // initialize MPI
    int mpi_rank, mpi_size;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);

    if (ny % mpi_size) {
        std::cout << "error : global domain dimension " << ny
                  << " must be divisible by number of MPI ranks " << mpi_size << "\n";
        exit(1);
    }
    else if (mpi_rank == 0) {
        std::cout << "\n## " << mpi_size << " MPI ranks" << std::endl;
        std::cout << "## " << nx << "x" << ny
                  << " : " << nx << "x" << ny/mpi_size << " per rank"
                  << " for " << nsteps << " time steps"
                  << " (" << nx*ny << " grid points)\n";
    }

    ny /= mpi_size;

    // adjust dimensions for halo
    nx += 2;
    ny += 2;

    // allocate memory on device and host
    // note : allocate enough memory for the halo around the boundary
    auto buffer_size = nx*ny;
#ifdef OPENACC_DATA
    double *x0 = malloc_host_pinned<double>(buffer_size);
    double *x1 = malloc_host_pinned<double>(buffer_size);
#else
    double *x_host = (double *) malloc(buffer_size*sizeof(double));
    // double *x_host = malloc_host_pinned<double>(buffer_size);
    double *x0 = malloc_device<double>(buffer_size);
    double *x1 = malloc_device<double>(buffer_size);
#endif

    double start_diffusion, time_diffusion;

#ifdef OPENACC_DATA
    // TODO: move data to the GPU
#endif
    {
        // set initial conditions of 0 everywhere
        fill_gpu(x0, 0., buffer_size);
        fill_gpu(x1, 0., buffer_size);

        // set boundary conditions of 1 on south border
        if (mpi_rank == 0) {
            fill_gpu(x0, 1., nx);
            fill_gpu(x1, 1., nx);
        }
        if (mpi_rank == mpi_size-1) {
            fill_gpu(x0+nx*(ny-1), 1., nx);
            fill_gpu(x1+nx*(ny-1), 1., nx);
        }

        auto south = mpi_rank - 1;
        auto north = mpi_rank + 1;

        // time stepping loop
        #pragma acc wait
        start_diffusion = get_time();
        for(auto step=0; step<nsteps; ++step) {
            MPI_Request requests[4];
            MPI_Status statuses[4];
            auto num_requests = 0;

#ifdef OPENACC_DATA
            // TODO: There are two ways to communicate:
            //   1. Update the host copy first and then communicate
            //   2. Use the optimised RDMA data path
#endif
            {
                // exchange with south
                if (south >= 0) {
                    // x0(:, 0) <- south
                    MPI_Irecv(x0, nx, MPI_DOUBLE, south, 0, MPI_COMM_WORLD, &requests[0]);
                    // x0(:, 1) -> south
                    MPI_Isend(x0+nx, nx, MPI_DOUBLE, south, 0, MPI_COMM_WORLD, &requests[1]);
                    num_requests += 2;
                }

                // exchange with north
                if(north < mpi_size) {
                    // x0(:, ny-1) <- north
                    MPI_Irecv(x0+(ny-1)*nx, nx, MPI_DOUBLE, north, 0, MPI_COMM_WORLD, &requests[num_requests]);
                    // x0(:, ny-2) -> north
                    MPI_Isend(x0+(ny-2)*nx, nx, MPI_DOUBLE, north, 0, MPI_COMM_WORLD, &requests[num_requests+1]);
                    num_requests += 2;
                }
            }

            MPI_Waitall(num_requests, requests, statuses);

            diffusion_gpu(x0, x1, nx-2, ny-2, dt);
#ifdef OPENACC_DATA
            copy_gpu(x0, x1, buffer_size);
#else
            std::swap(x0, x1);
#endif
        }
        #pragma acc wait
        time_diffusion = get_time() - start_diffusion;
    } // end of acc data

#ifdef OPENACC_DATA
    auto x_res = x1;
#else
    copy_to_host<double>(x0, x_host, buffer_size);
    auto x_res = x_host;
#endif

    if (mpi_rank == 0) {
        std::cout << "## " << time_diffusion << "s, "
                  << nsteps*(nx-2)*(ny-2)*mpi_size / time_diffusion << " points/second\n\n";

        std::cout << "writing to output.bin/bov\n";
        write_to_file(nx, ny, x_res);
    }

    MPI_Finalize();

    return 0;
}
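// One possible way to fill in the two OpenACC TODOs above, mirroring the
// single-node version earlier in this file (a sketch under those assumptions,
// not the course's reference solution). Data region:
//
//     #pragma acc data create(x0[0:buffer_size]) copyout(x1[0:buffer_size])
//
// Halo exchange, option 1: stage only the boundary rows through the host
// around the MPI calls (OpenACC sections are [start:length]):
//
//     #pragma acc update host(x0[nx:nx], x0[(ny-2)*nx:nx])    // rows to send
//     /* ... MPI_Irecv/MPI_Isend/MPI_Waitall as above ... */
//     #pragma acc update device(x0[0:nx], x0[(ny-1)*nx:nx])   // rows received
//
// Option 2: with CUDA-aware MPI, hand the device addresses to MPI directly:
//
//     #pragma acc host_data use_device(x0)
//     { /* the same MPI calls, now operating on device pointers */ }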