Exemple #1
0
/*
 * GPU forward pass of a convolutional recurrent layer.
 *
 * For each of l.steps time steps the function:
 *   1. runs the input convolution on the current slice of net.input_gpu,
 *   2. runs the self (recurrent) convolution on the current state,
 *   3. builds the new state as input output + self output (plus the old
 *      state when l.shortcut is set),
 *   4. runs the output convolution on the new state.
 *
 * `l` and `net` are passed by value, so the pointer bumps performed here
 * (l.state_gpu, net.input_gpu) are local to this call.
 */
void forward_crnn_layer_gpu(layer l, network net)
{
    /* Private copies of the sub-layers; increment_layer() is applied to them per step. */
    layer in_conv = *(l.input_layer);
    layer rec_conv = *(l.self_layer);
    layer out_conv = *(l.output_layer);
    network sub = net;

    /* Number of floats in one time step of the hidden state. */
    const int state_len = l.hidden * l.batch;
    int t;

    /* Fill the delta buffers of all three sub-layers with 0 over every step. */
    fill_gpu(l.outputs * l.batch * l.steps, 0, out_conv.delta_gpu, 1);
    fill_gpu(state_len * l.steps, 0, rec_conv.delta_gpu, 1);
    fill_gpu(state_len * l.steps, 0, in_conv.delta_gpu, 1);
    if (net.train) {
        fill_gpu(state_len, 0, l.state_gpu, 1);
    }

    t = 0;
    while (t < l.steps) {
        float *prev_state;

        sub.input_gpu = net.input_gpu;
        forward_convolutional_layer_gpu(in_conv, sub);

        sub.input_gpu = l.state_gpu;
        forward_convolutional_layer_gpu(rec_conv, sub);

        /* When training, every step writes its state into the next slot of
         * l.state_gpu; otherwise the same slot is overwritten in place. */
        prev_state = l.state_gpu;
        if (net.train) {
            l.state_gpu += state_len;
        }
        if (!l.shortcut) {
            fill_gpu(state_len, 0, l.state_gpu, 1);
        } else {
            copy_gpu(state_len, prev_state, 1, l.state_gpu, 1);
        }
        axpy_gpu(state_len, 1, in_conv.output_gpu, 1, l.state_gpu, 1);
        axpy_gpu(state_len, 1, rec_conv.output_gpu, 1, l.state_gpu, 1);

        sub.input_gpu = l.state_gpu;
        forward_convolutional_layer_gpu(out_conv, sub);

        /* Move on to the next time step of the input and of each sub-layer. */
        net.input_gpu += l.inputs * l.batch;
        increment_layer(&in_conv, 1);
        increment_layer(&rec_conv, 1);
        increment_layer(&out_conv, 1);

        ++t;
    }
}
Exemple #2
0
float *cuda_make_array(float *x, size_t n)
{
    float *x_gpu;
    size_t size = sizeof(float)*n;
    cudaError_t status = cudaMalloc((void **)&x_gpu, size);
    check_error(status);
    if(x){
        status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice);
        check_error(status);
    } else {
        fill_gpu(n, 0, x_gpu, 1);
    }
    if(!x_gpu) error("Cuda malloc failed\n");
    return x_gpu;
}
/*
 * GPU forward pass of an LSTM layer over l.steps time steps.
 *
 * Per step, with x the current input slice and h the previous hidden state
 * (assuming mul_gpu multiplies its first array into its second and axpy_gpu
 * accumulates into its last array):
 *   f = LOGISTIC(wf(h) + uf(x))
 *   i = LOGISTIC(wi(h) + ui(x))
 *   g = TANH    (wg(h) + ug(x))
 *   o = LOGISTIC(wo(h) + uo(x))
 *   c = f * c + i * g
 *   h = o * TANH(c)
 * c and h are then stored into the current slice of l.cell_gpu / l.output_gpu.
 *
 * `l` and `state` are by-value copies, so the pointer bumps below are local.
 */
void forward_lstm_layer_gpu(layer l, network state) {
	/* Minimal network handed to the connected sub-layers: only train and
	 * input_gpu are set.
	 * NOTE(review): sub.st stays zero while every other call here uses
	 * state.st -- confirm the sub-layers do not need the same stream. */
	network sub = { 0 };
	sub.train = state.train;
	int t;

	/* Private copies of the gate layers; increment_layer() is applied to them per step. */
	layer wf = *(l.wf);
	layer wi = *(l.wi);
	layer wg = *(l.wg);
	layer wo = *(l.wo);

	layer uf = *(l.uf);
	layer ui = *(l.ui);
	layer ug = *(l.ug);
	layer uo = *(l.uo);

	/* Element counts: one time step, and all time steps. */
	const int n = l.outputs * l.batch;
	const int total = n * l.steps;

	/* Fill every gate layer's delta buffer with 0 across all steps. */
	fill_gpu(total, 0, wf.delta_gpu, 1, state.st);
	fill_gpu(total, 0, wi.delta_gpu, 1, state.st);
	fill_gpu(total, 0, wg.delta_gpu, 1, state.st);
	fill_gpu(total, 0, wo.delta_gpu, 1, state.st);

	fill_gpu(total, 0, uf.delta_gpu, 1, state.st);
	fill_gpu(total, 0, ui.delta_gpu, 1, state.st);
	fill_gpu(total, 0, ug.delta_gpu, 1, state.st);
	fill_gpu(total, 0, uo.delta_gpu, 1, state.st);
	if (state.train) {
		fill_gpu(total, 0, l.delta_gpu, 1, state.st);
	}

	for (t = 0; t < l.steps; ++t) {
		/* W* layers read l.h_gpu, which still holds the previous step's h. */
		sub.input_gpu = l.h_gpu;
		forward_connected_layer_gpu(wf, sub);
		forward_connected_layer_gpu(wi, sub);
		forward_connected_layer_gpu(wg, sub);
		forward_connected_layer_gpu(wo, sub);

		/* U* layers read the current slice of the layer input. */
		sub.input_gpu = state.input_gpu;
		forward_connected_layer_gpu(uf, sub);
		forward_connected_layer_gpu(ui, sub);
		forward_connected_layer_gpu(ug, sub);
		forward_connected_layer_gpu(uo, sub);

		/* Gate pre-activations: W output copied in, U output added via axpy. */
		copy_gpu(n, wf.output_gpu, 1, l.f_gpu, 1, state.st);
		axpy_gpu(n, 1, uf.output_gpu, 1, l.f_gpu, 1, state.st);

		copy_gpu(n, wi.output_gpu, 1, l.i_gpu, 1, state.st);
		axpy_gpu(n, 1, ui.output_gpu, 1, l.i_gpu, 1, state.st);

		copy_gpu(n, wg.output_gpu, 1, l.g_gpu, 1, state.st);
		axpy_gpu(n, 1, ug.output_gpu, 1, l.g_gpu, 1, state.st);

		copy_gpu(n, wo.output_gpu, 1, l.o_gpu, 1, state.st);
		axpy_gpu(n, 1, uo.output_gpu, 1, l.o_gpu, 1, state.st);

		/* Gate non-linearities. */
		activate_array_gpu(l.f_gpu, n, LOGISTIC, state.st);
		activate_array_gpu(l.i_gpu, n, LOGISTIC, state.st);
		activate_array_gpu(l.g_gpu, n, TANH, state.st);
		activate_array_gpu(l.o_gpu, n, LOGISTIC, state.st);

		/* Cell update: temp = i * g, c = f * c, c += temp. */
		copy_gpu(n, l.i_gpu, 1, l.temp_gpu, 1, state.st);
		mul_gpu(n, l.g_gpu, 1, l.temp_gpu, 1, state.st);
		mul_gpu(n, l.f_gpu, 1, l.c_gpu, 1, state.st);
		axpy_gpu(n, 1, l.temp_gpu, 1, l.c_gpu, 1, state.st);

		/* Hidden update: h = o * TANH(c). */
		copy_gpu(n, l.c_gpu, 1, l.h_gpu, 1, state.st);
		activate_array_gpu(l.h_gpu, n, TANH, state.st);
		mul_gpu(n, l.o_gpu, 1, l.h_gpu, 1, state.st);

		/* Store this step's cell and hidden values in the per-step buffers. */
		copy_gpu(n, l.c_gpu, 1, l.cell_gpu, 1, state.st);
		copy_gpu(n, l.h_gpu, 1, l.output_gpu, 1, state.st);

		/* Move on to the next time step. */
		state.input_gpu += l.inputs * l.batch;
		l.output_gpu += n;
		l.cell_gpu += n;

		increment_layer(&wf, 1);
		increment_layer(&wi, 1);
		increment_layer(&wg, 1);
		increment_layer(&wo, 1);

		increment_layer(&uf, 1);
		increment_layer(&ui, 1);
		increment_layer(&ug, 1);
		increment_layer(&uo, 1);
	}
}
// 2D diffusion driver (single process).
//
// Command line:
//   argv[1] : y dimension is 2^argv[1] interior points (default 8)
//   argv[2] : number of time steps                     (default 100)
//   argv[3] : nonzero to request the shared-memory version (default 0);
//             parsed but not used anywhere in this function
//
// The domain is (128+2) x (2^pow+2): the extra 2 in each direction is the
// halo. With OPENACC_DATA defined, x0/x1 are host allocations managed by an
// OpenACC data region; otherwise they are device allocations and the result
// is copied back into x_host.
int main(int argc, char** argv) {
    // set up parameters
    // first argument is the y dimension = 2^arg
    size_t pow    = read_arg(argc, argv, 1, 8);
    // second argument is the number of time steps
    size_t nsteps = read_arg(argc, argv, 2, 100);
    // third argument is nonzero if shared memory version is to be used
    bool use_shared = read_arg(argc, argv, 3, 0);

    // set domain size
    size_t nx = 128+2;
    // shift in size_t: a plain `1 << pow` is an int shift and overflows for pow >= 31
    size_t ny = (size_t(1) << pow)+2;
    double dt = 0.1;

    std::cout << "\n## " << nx << "x" << ny
              << " for " << nsteps << " time steps"
              << " (" << nx*ny << " grid points)\n";

    // allocate memory on device and host
    // note : allocate enough memory for the halo around the boundary
    auto buffer_size = nx*ny;

#ifdef OPENACC_DATA
    double *x0     = malloc_host_pinned<double>(buffer_size);
    double *x1     = malloc_host_pinned<double>(buffer_size);
#else
    double *x_host = malloc_host_pinned<double>(buffer_size);
    double *x0     = malloc_device<double>(buffer_size);
    double *x1     = malloc_device<double>(buffer_size);
#endif

    double start_diffusion, time_diffusion;

#ifdef OPENACC_DATA
    #pragma acc data create(x0[0:buffer_size]) copyout(x1[0:buffer_size])
#endif
    {
        // set initial conditions of 0 everywhere
        fill_gpu(x0, 0., buffer_size);
        fill_gpu(x1, 0., buffer_size);

        // set boundary conditions of 1 on the first row (south border) ...
        fill_gpu(x0, 1., nx);
        fill_gpu(x1, 1., nx);
        // ... and on the last row (north border)
        fill_gpu(x0+nx*(ny-1), 1., nx);
        fill_gpu(x1+nx*(ny-1), 1., nx);

        // time stepping loop
        #pragma acc wait
        start_diffusion = get_time();
        // size_t index: `auto step=0` would be an int compared against size_t nsteps
        for(size_t step=0; step<nsteps; ++step) {
            diffusion_gpu(x0, x1, nx-2, ny-2, dt);
#ifdef OPENACC_DATA
            // pointers are bound to the data region, so copy instead of swapping
            copy_gpu(x0, x1, buffer_size);
#else
            std::swap(x0, x1);
#endif
        }

        #pragma acc wait
        time_diffusion = get_time() - start_diffusion;
    } // end of acc data

#ifdef OPENACC_DATA
    auto x_res = x1;
#else
    copy_to_host<double>(x0, x_host, buffer_size);
    auto x_res = x_host;
#endif

    std::cout << "## " << time_diffusion << "s, "
              << nsteps*(nx-2)*(ny-2) / time_diffusion << " points/second\n\n";

    std::cout << "writing to output.bin/bov\n";
    write_to_file(nx, ny, x_res);
    return 0;
}
Exemple #5
0
/*
 * Adversarially train a colorizer network against a discriminator network.
 *
 * cfg, weight   : config / weights file of the colorizer network
 * acfg, aweight : config / weights file of the adversary network
 * clear         : forwarded to load_network()
 * display       : when nonzero (and built with OPENCV) show sample images
 *
 * Each iteration loads a batch of images, builds a grayscale copy, runs the
 * colorizer on the grayscale data, back-propagates the adversary's delta
 * (collected in `imerror`) together with the colorizer's own last-layer
 * delta, then trains the adversary on real + generated images. Weights are
 * written to backup_directory periodically and at the end.
 *
 * The body is compiled only when GPU is defined; otherwise this is a no-op.
 */
void train_colorizer(char *cfg, char *weight, char *acfg, char *aweight, int clear, int display)
{
#ifdef GPU
    //char *train_images = "/home/kunle12/data/coco/train1.txt";
    //char *train_images = "/home/kunle12/data/coco/trainvalno5k.txt";
    char *train_images = "/home/kunle12/data/imagenet/imagenet1k.train.list";
    char *backup_directory = "/home/kunle12/backup/";
    srand(time(0));
    char *base = basecfg(cfg);
    char *abase = basecfg(acfg);
    printf("%s\n", base);
    network *net = load_network(cfg, weight, clear);
    network *anet = load_network(acfg, aweight, clear);

    int i, j, k;
    /* First layer with 3 output channels is treated as the image-producing layer. */
    layer imlayer = {0};
    for (i = 0; i < net->n; ++i) {
        if (net->layers[i].out_c == 3) {
            imlayer = net->layers[i];
            break;
        }
    }

    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
    int imgs = net->batch*net->subdivisions;
    i = *net->seen/imgs;
    data train, buffer;


    list *plist = get_paths(train_images);
    //int N = plist->size;
    char **paths = (char **)list_to_array(plist);

    load_args args= get_base_args(net);
    args.paths = paths;
    args.n = imgs;
    args.m = plist->size;
    args.d = &buffer;

    args.type = CLASSIFICATION_DATA;
    args.classes = 1;
    char *ls[2] = {"imagenet"};
    args.labels = ls;

    pthread_t load_thread = load_data_in_thread(args);
    clock_t time;

    int x_size = net->inputs*net->batch;
    //int y_size = x_size;
    net->delta = 0;
    net->train = 1;
    float *pixs = calloc(x_size, sizeof(float));
    float *graypixs = calloc(x_size, sizeof(float));
    //float *y = calloc(y_size, sizeof(float));

    //int ay_size = anet->outputs*anet->batch;
    anet->delta = 0;
    anet->train = 1;

    /* Device buffer that receives the adversary's input delta. */
    float *imerror = cuda_make_array(0, imlayer.outputs*imlayer.batch);

    /* Negative means "no running average yet". */
    float aloss_avg = -1;
    float gloss_avg = -1;

    //data generated = copy_data(train);

    while (get_current_batch(net) < net->max_batches) {
        i += 1;
        time=clock();
        /* Collect the batch loaded in the background and start loading the next one. */
        pthread_join(load_thread, 0);
        train = buffer;
        load_thread = load_data_in_thread(args);

        printf("Loaded: %lf seconds\n", sec(clock()-time));

        /* Grayscale copy of the batch; labels .95 for real images, .05 for the copy. */
        data gray = copy_data(train);
        for(j = 0; j < imgs; ++j){
            image gim = float_to_image(net->w, net->h, net->c, gray.X.vals[j]);
            grayscale_image_3c(gim);
            train.y.vals[j][0] = .95;
            gray.y.vals[j][0] = .05;
        }
        time=clock();
        float gloss = 0;

        for(j = 0; j < net->subdivisions; ++j){
            get_next_batch(train, net->batch, j*net->batch, pixs, 0);
            get_next_batch(gray, net->batch, j*net->batch, graypixs, 0);
            cuda_push_array(net->input_gpu, graypixs, net->inputs*net->batch);
            cuda_push_array(net->truth_gpu, pixs, net->truths*net->batch);
            /*
               image origi = float_to_image(net->w, net->h, 3, pixs);
               image grayi = float_to_image(net->w, net->h, 3, graypixs);
               show_image(grayi, "gray");
               show_image(origi, "orig");
               cvWaitKey(0);
             */
            *net->seen += net->batch;
            forward_network_gpu(net);

            fill_gpu(imlayer.outputs*imlayer.batch, 0, imerror, 1);
            copy_gpu(anet->inputs*anet->batch, imlayer.output_gpu, 1, anet->input_gpu, 1);
            /* The truth buffer is sized by truths*batch (as for net->truth_gpu
             * above); filling inputs*batch elements would overrun it. */
            fill_gpu(anet->truths*anet->batch, .95, anet->truth_gpu, 1);
            anet->delta_gpu = imerror;
            forward_network_gpu(anet);
            backward_network_gpu(anet);

            /* Scale the colorizer's own last-layer delta by 1/100; the
             * adversary delta is kept at weight 1. */
            scal_gpu(imlayer.outputs*imlayer.batch, 1./100., net->layers[net->n-1].delta_gpu, 1);

            scal_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1);

            printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs*imlayer.batch));
            printf("features %f\n", cuda_mag_array(net->layers[net->n-1].delta_gpu, imlayer.outputs*imlayer.batch));

            axpy_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, net->layers[net->n-1].delta_gpu, 1);

            backward_network_gpu(net);


            gloss += *net->cost /(net->subdivisions*net->batch);

            /* Replace the grayscale samples with the generated images so the
             * adversary is trained on them below. */
            for(k = 0; k < net->batch; ++k){
                int index = j*net->batch + k;
                copy_cpu(imlayer.outputs, imlayer.output + k*imlayer.outputs, 1, gray.X.vals[index], 1);
            }
        }
        harmless_update_network_gpu(anet);

        data merge = concat_data(train, gray);
        //randomize_data(merge);
        float aloss = train_network(anet, merge);

        update_network_gpu(net);

#ifdef OPENCV
        if(display){
            image im = float_to_image(anet->w, anet->h, anet->c, gray.X.vals[0]);
            image im2 = float_to_image(anet->w, anet->h, anet->c, train.X.vals[0]);
            show_image(im, "gen", 1);
            show_image(im2, "train", 1);
        }
#endif
        free_data(merge);
        free_data(train);
        free_data(gray);
        /* Seed both running averages with the first observed loss; without
         * this gloss_avg would start its average from the -1 sentinel. */
        if (aloss_avg < 0) aloss_avg = aloss;
        if (gloss_avg < 0) gloss_avg = gloss;
        aloss_avg = aloss_avg*.9 + aloss*.1;
        gloss_avg = gloss_avg*.9 + gloss*.1;

        printf("%d: gen: %f, adv: %f | gen_avg: %f, adv_avg: %f, %f rate, %lf seconds, %d images\n", i, gloss, aloss, gloss_avg, aloss_avg, get_current_rate(net), sec(clock()-time), i*imgs);
        if(i%1000==0){
            char buff[256];
            /* snprintf: config base names are caller-supplied and may not fit in buff. */
            snprintf(buff, sizeof(buff), "%s/%s_%d.weights", backup_directory, base, i);
            save_weights(net, buff);
            snprintf(buff, sizeof(buff), "%s/%s_%d.weights", backup_directory, abase, i);
            save_weights(anet, buff);
        }
        if(i%100==0){
            char buff[256];
            snprintf(buff, sizeof(buff), "%s/%s.backup", backup_directory, base);
            save_weights(net, buff);
            snprintf(buff, sizeof(buff), "%s/%s.backup", backup_directory, abase);
            save_weights(anet, buff);
        }
    }
    char buff[256];
    snprintf(buff, sizeof(buff), "%s/%s_final.weights", backup_directory, base);
    save_weights(net, buff);

    /* Release the host-side scratch buffers allocated with calloc above. */
    free(pixs);
    free(graypixs);
#endif
}
Exemple #6
0
/*
 * Train a DCGAN: generator network `gnet` against adversary network `anet`.
 *
 * cfg, weight   : config / weights file of the generator
 * acfg, aweight : config / weights file of the adversary
 * clear         : forwarded to load_network()
 * display       : when nonzero (and built with OPENCV) show and save samples
 * train_images  : path of the file listing the training images
 * maxbatch      : number of batches to train for; 0 means gnet->max_batches
 *
 * Each iteration feeds normalized random noise through the generator, runs
 * the adversary on the generated images with truth 1, uses the adversary's
 * input delta (collected in `imerror`) as the generator's last-layer delta,
 * then trains the adversary on real (label 1) + generated (label 0) images.
 *
 * The body is compiled only when GPU is defined; otherwise this is a no-op.
 */
void train_dcgan(char *cfg, char *weight, char *acfg, char *aweight, int clear, int display, char *train_images, int maxbatch)
{
#ifdef GPU
    char *backup_directory = "/home/kunle12/backup/";
    srand(time(0));
    char *base = basecfg(cfg);
    char *abase = basecfg(acfg);
    printf("%s\n", base);
    network *gnet = load_network(cfg, weight, clear);
    network *anet = load_network(acfg, aweight, clear);
    //float orig_rate = anet->learning_rate;

    int i, j, k;
    /* First layer with 3 output channels is treated as the image-producing layer. */
    layer imlayer = {0};
    for (i = 0; i < gnet->n; ++i) {
        if (gnet->layers[i].out_c == 3) {
            imlayer = gnet->layers[i];
            break;
        }
    }

    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", gnet->learning_rate, gnet->momentum, gnet->decay);
    int imgs = gnet->batch*gnet->subdivisions;
    i = *gnet->seen/imgs;
    data train, buffer;


    list *plist = get_paths(train_images);
    //int N = plist->size;
    char **paths = (char **)list_to_array(plist);

    load_args args= get_base_args(anet);
    args.paths = paths;
    args.n = imgs;
    args.m = plist->size;
    args.d = &buffer;
    args.type = CLASSIFICATION_DATA;
    args.threads=16;
    args.classes = 1;
    char *ls[2] = {"imagenet", "zzzzzzzz"};
    args.labels = ls;

    pthread_t load_thread = load_data_in_thread(args);
    clock_t time;

    gnet->train = 1;
    anet->train = 1;

    int x_size = gnet->inputs*gnet->batch;
    int y_size = gnet->truths*gnet->batch;
    /* Device buffer that receives the adversary's input delta. */
    float *imerror = cuda_make_array(0, y_size);

    //int ay_size = anet->truths*anet->batch;

    /* Negative means "no running average yet". */
    float aloss_avg = -1;

    //data generated = copy_data(train);

    if (maxbatch == 0) maxbatch = gnet->max_batches;
    while (get_current_batch(gnet) < maxbatch) {
        i += 1;
        time=clock();
        /* Collect the batch loaded in the background and start loading the next one. */
        pthread_join(load_thread, 0);
        train = buffer;

        //translate_data_rows(train, -.5);
        //scale_data_rows(train, 2);

        load_thread = load_data_in_thread(args);

        printf("Loaded: %lf seconds\n", sec(clock()-time));

        /* `gen` starts as a copy of the real data; its samples are replaced
         * by generator output below. Labels: real = 1, generated = 0. */
        data gen = copy_data(train);
        for (j = 0; j < imgs; ++j) {
            train.y.vals[j][0] = 1;
            gen.y.vals[j][0] = 0;
        }
        time=clock();

        for(j = 0; j < gnet->subdivisions; ++j){
            get_next_batch(train, gnet->batch, j*gnet->batch, gnet->truth, 0);
            int z;
            /* Generator input: random normal noise, scaled per sample by 1/magnitude. */
            for(z = 0; z < x_size; ++z){
                gnet->input[z] = rand_normal();
            }
            for(z = 0; z < gnet->batch; ++z){
                float mag = mag_array(gnet->input + z*gnet->inputs, gnet->inputs);
                scale_array(gnet->input + z*gnet->inputs, gnet->inputs, 1./mag);
            }
            /*
               for(z = 0; z < 100; ++z){
               printf("%f, ", gnet->input[z]);
               }
               printf("\n");
               printf("input: %f %f\n", mean_array(gnet->input, x_size), variance_array(gnet->input, x_size));
             */

            //cuda_push_array(gnet->input_gpu, gnet->input, x_size);
            //cuda_push_array(gnet->truth_gpu, gnet->truth, y_size);
            *gnet->seen += gnet->batch;
            forward_network(gnet);

            fill_gpu(imlayer.outputs*imlayer.batch, 0, imerror, 1);
            fill_cpu(anet->truths*anet->batch, 1, anet->truth, 1);
            copy_cpu(anet->inputs*anet->batch, imlayer.output, 1, anet->input, 1);
            anet->delta_gpu = imerror;
            forward_network(anet);
            backward_network(anet);

            //float genaloss = *anet->cost / anet->batch;
            //printf("%f\n", genaloss);

            /* Scale the generator's own last-layer delta by 0 so that, after
             * the axpy below, only the adversary delta remains. */
            scal_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1);
            scal_gpu(imlayer.outputs*imlayer.batch, 0, gnet->layers[gnet->n-1].delta_gpu, 1);

            //printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs*imlayer.batch));
            //printf("features %f\n", cuda_mag_array(gnet->layers[gnet->n-1].delta_gpu, imlayer.outputs*imlayer.batch));

            axpy_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, gnet->layers[gnet->n-1].delta_gpu, 1);

            backward_network(gnet);

            /*
               for(k = 0; k < gnet->n; ++k){
               layer l = gnet->layers[k];
               cuda_pull_array(l.output_gpu, l.output, l.outputs*l.batch);
               printf("%d: %f %f\n", k, mean_array(l.output, l.outputs*l.batch), variance_array(l.output, l.outputs*l.batch));
               }
             */

            /* Store the generated images as the "fake" samples for the adversary. */
            for(k = 0; k < gnet->batch; ++k){
                int index = j*gnet->batch + k;
                copy_cpu(gnet->outputs, gnet->output + k*gnet->outputs, 1, gen.X.vals[index], 1);
            }
        }
        harmless_update_network_gpu(anet);

        data merge = concat_data(train, gen);
        //randomize_data(merge);
        float aloss = train_network(anet, merge);

        //translate_image(im, 1);
        //scale_image(im, .5);
        //translate_image(im2, 1);
        //scale_image(im2, .5);
#ifdef OPENCV
        if(display){
            image im = float_to_image(anet->w, anet->h, anet->c, gen.X.vals[0]);
            image im2 = float_to_image(anet->w, anet->h, anet->c, train.X.vals[0]);
            show_image(im, "gen", 1);
            show_image(im2, "train", 1);
            save_image(im, "gen");
            save_image(im2, "train");
        }
#endif

        /*
           if(aloss < .1){
           anet->learning_rate = 0;
           } else if (aloss > .3){
           anet->learning_rate = orig_rate;
           }
         */

        update_network_gpu(gnet);

        free_data(merge);
        free_data(train);
        free_data(gen);
        if (aloss_avg < 0) aloss_avg = aloss;
        aloss_avg = aloss_avg*.9 + aloss*.1;

        printf("%d: adv: %f | adv_avg: %f, %f rate, %lf seconds, %d images\n", i, aloss, aloss_avg, get_current_rate(gnet), sec(clock()-time), i*imgs);
        if(i%10000==0){
            char buff[256];
            /* snprintf: config base names are caller-supplied and may not fit in buff. */
            snprintf(buff, sizeof(buff), "%s/%s_%d.weights", backup_directory, base, i);
            save_weights(gnet, buff);
            snprintf(buff, sizeof(buff), "%s/%s_%d.weights", backup_directory, abase, i);
            save_weights(anet, buff);
        }
        if(i%1000==0){
            char buff[256];
            snprintf(buff, sizeof(buff), "%s/%s.backup", backup_directory, base);
            save_weights(gnet, buff);
            snprintf(buff, sizeof(buff), "%s/%s.backup", backup_directory, abase);
            save_weights(anet, buff);
        }
    }
    char buff[256];
    snprintf(buff, sizeof(buff), "%s/%s_final.weights", backup_directory, base);
    save_weights(gnet, buff);

    /* Must stay inside the GPU block: gnet/anet are only declared here, so
     * freeing them after #endif breaks non-GPU builds. */
    free_network(gnet);
    free_network(anet);
#endif
}
Exemple #7
0
/*
 * Progressive GAN-style training of generator `gnet` against adversary `anet`.
 *
 * cfg, weight   : config / weights file of the generator
 * acfg, aweight : config / weights file of the adversary
 * clear         : forwarded to load_network()
 * display       : when nonzero (and built with OPENCV) show and save samples
 * train_images  : path of the file listing the training images
 * maxbatch      : number of batches to train for; 0 means gnet->max_batches
 *
 * Same loop as a plain GAN trainer, except that before every iteration a
 * blend factor alpha = current_batch / (maxbatch/2), clamped to 1, is pushed
 * to both networks through set_network_alpha_beta() (gnet gets (alpha, beta),
 * anet gets (beta, alpha), with beta = 1 - alpha).
 *
 * The body is compiled only when GPU is defined; otherwise this is a no-op.
 */
void train_prog(char *cfg, char *weight, char *acfg, char *aweight, int clear, int display, char *train_images, int maxbatch)
{
#ifdef GPU
    char *backup_directory = "/home/kunle12/backup/";
    srand(time(0));
    char *base = basecfg(cfg);
    char *abase = basecfg(acfg);
    printf("%s\n", base);
    network *gnet = load_network(cfg, weight, clear);
    network *anet = load_network(acfg, aweight, clear);

    int i, j, k;
    /* The generator's last layer is the image-producing layer. */
    layer imlayer = gnet->layers[gnet->n-1];

    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", gnet->learning_rate, gnet->momentum, gnet->decay);
    int imgs = gnet->batch*gnet->subdivisions;
    i = *gnet->seen/imgs;
    data train, buffer;


    list *plist = get_paths(train_images);
    char **paths = (char **)list_to_array(plist);

    load_args args= get_base_args(anet);
    args.paths = paths;
    args.n = imgs;
    args.m = plist->size;
    args.d = &buffer;
    args.type = CLASSIFICATION_DATA;
    args.threads=16;
    args.classes = 1;
    char *ls[2] = {"imagenet", "zzzzzzzz"};
    args.labels = ls;

    pthread_t load_thread = load_data_in_thread(args);
    clock_t time;

    gnet->train = 1;
    anet->train = 1;

    int x_size = gnet->inputs*gnet->batch;
    int y_size = gnet->truths*gnet->batch;
    /* Device buffer that receives the adversary's input delta. */
    float *imerror = cuda_make_array(0, y_size);

    /* Negative means "no running average yet". */
    float aloss_avg = -1;

    if (maxbatch == 0) maxbatch = gnet->max_batches;
    while (get_current_batch(gnet) < maxbatch) {
        {
            int cb = get_current_batch(gnet);
            /* maxbatch/2 is an integer division and is 0 for maxbatch == 1;
             * treat that as "ramp already finished" instead of dividing by 0. */
            int half = maxbatch/2;
            float alpha = (half > 0) ? (float) cb / half : 1;
            if(alpha > 1) alpha = 1;
            float beta = 1 - alpha;
            printf("%f %f\n", alpha, beta);
            set_network_alpha_beta(gnet, alpha, beta);
            set_network_alpha_beta(anet, beta, alpha);
        }

        i += 1;
        time=clock();
        /* Collect the batch loaded in the background and start loading the next one. */
        pthread_join(load_thread, 0);
        train = buffer;

        load_thread = load_data_in_thread(args);

        printf("Loaded: %lf seconds\n", sec(clock()-time));

        /* `gen` starts as a copy of the real data; its samples are replaced
         * by generator output below. Labels: real = 1, generated = 0. */
        data gen = copy_data(train);
        for (j = 0; j < imgs; ++j) {
            train.y.vals[j][0] = 1;
            gen.y.vals[j][0] = 0;
        }
        time=clock();

        for (j = 0; j < gnet->subdivisions; ++j) {
            get_next_batch(train, gnet->batch, j*gnet->batch, gnet->truth, 0);
            int z;
            /* Generator input: random normal noise (not normalized here). */
            for(z = 0; z < x_size; ++z){
                gnet->input[z] = rand_normal();
            }
            /*
               for(z = 0; z < gnet->batch; ++z){
               float mag = mag_array(gnet->input + z*gnet->inputs, gnet->inputs);
               scale_array(gnet->input + z*gnet->inputs, gnet->inputs, 1./mag);
               }
             */
            *gnet->seen += gnet->batch;
            forward_network(gnet);

            fill_gpu(imlayer.outputs*imlayer.batch, 0, imerror, 1);
            fill_cpu(anet->truths*anet->batch, 1, anet->truth, 1);
            copy_cpu(anet->inputs*anet->batch, imlayer.output, 1, anet->input, 1);
            anet->delta_gpu = imerror;
            forward_network(anet);
            backward_network(anet);

            //float genaloss = *anet->cost / anet->batch;

            /* Scale the generator's own last-layer delta by 0 so that, after
             * the axpy below, only the adversary delta remains. */
            scal_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1);
            scal_gpu(imlayer.outputs*imlayer.batch, 0, gnet->layers[gnet->n-1].delta_gpu, 1);

            axpy_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, gnet->layers[gnet->n-1].delta_gpu, 1);

            backward_network(gnet);

            /* Store the generated images as the "fake" samples for the adversary. */
            for(k = 0; k < gnet->batch; ++k){
                int index = j*gnet->batch + k;
                copy_cpu(gnet->outputs, gnet->output + k*gnet->outputs, 1, gen.X.vals[index], 1);
            }
        }
        harmless_update_network_gpu(anet);

        data merge = concat_data(train, gen);
        float aloss = train_network(anet, merge);

#ifdef OPENCV
        if(display){
            image im = float_to_image(anet->w, anet->h, anet->c, gen.X.vals[0]);
            image im2 = float_to_image(anet->w, anet->h, anet->c, train.X.vals[0]);
            show_image(im, "gen", 1);
            show_image(im2, "train", 1);
            save_image(im, "gen");
            save_image(im2, "train");
        }
#endif

        update_network_gpu(gnet);

        free_data(merge);
        free_data(train);
        free_data(gen);
        if (aloss_avg < 0) aloss_avg = aloss;
        aloss_avg = aloss_avg*.9 + aloss*.1;

        printf("%d: adv: %f | adv_avg: %f, %f rate, %lf seconds, %d images\n", i, aloss, aloss_avg, get_current_rate(gnet), sec(clock()-time), i*imgs);
        if(i%10000==0){
            char buff[256];
            /* snprintf: config base names are caller-supplied and may not fit in buff. */
            snprintf(buff, sizeof(buff), "%s/%s_%d.weights", backup_directory, base, i);
            save_weights(gnet, buff);
            snprintf(buff, sizeof(buff), "%s/%s_%d.weights", backup_directory, abase, i);
            save_weights(anet, buff);
        }
        if(i%1000==0){
            char buff[256];
            snprintf(buff, sizeof(buff), "%s/%s.backup", backup_directory, base);
            save_weights(gnet, buff);
            snprintf(buff, sizeof(buff), "%s/%s.backup", backup_directory, abase);
            save_weights(anet, buff);
        }
    }
    char buff[256];
    snprintf(buff, sizeof(buff), "%s/%s_final.weights", backup_directory, base);
    save_weights(gnet, buff);

    /* Must stay inside the GPU block: gnet/anet are only declared here, so
     * freeing them after #endif breaks non-GPU builds. */
    free_network( gnet );
    free_network( anet );
#endif
}
// 2D diffusion driver, domain-decomposed along y across MPI ranks.
//
// Command line:
//   argv[1] : global y dimension is 2^argv[1] interior points (default 8)
//   argv[2] : number of time steps                            (default 100)
//   argv[3] : nonzero to request the shared-memory version (default 0);
//             parsed but not used anywhere in this function
//
// The global y dimension must be divisible by the number of ranks. Each rank
// owns nx x (ny/mpi_size) interior points plus a one-cell halo, exchanges its
// first/last interior rows with the south/north neighbour every step, and
// rank 0 writes its local result to file at the end.
//
// Returns 0 on success, 1 if the domain cannot be split evenly.
int main(int argc, char** argv) {
    // set up parameters
    // first argument is the y dimension = 2^arg
    size_t pow    = read_arg(argc, argv, 1, 8);
    // second argument is the number of time steps
    size_t nsteps = read_arg(argc, argv, 2, 100);
    // third argument is nonzero if shared memory version is to be used
    bool use_shared = read_arg(argc, argv, 3, 0);

    // set domain size
    size_t nx = 128;
    // shift in size_t: a plain `1 << pow` is an int shift and overflows for pow >= 31
    size_t ny = size_t(1) << pow;
    double dt = 0.1;

    // initialize MPI
    int mpi_rank, mpi_size;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
    if (ny % mpi_size) {
        // Every rank evaluates the same condition, so all of them leave here
        // together; report once and shut MPI down cleanly before exiting.
        if (mpi_rank == 0) {
            std::cout << "error : global domain dimension " << ny
                      << " must be divisible by number of MPI ranks "
                      << mpi_size << "\n";
        }
        MPI_Finalize();
        return 1;
    }
    if (mpi_rank == 0) {
        std::cout << "\n## " << mpi_size << " MPI ranks" << std::endl;
        std::cout << "## " << nx << "x" << ny
                  << " : " << nx << "x" << ny/mpi_size << " per rank"
                  << " for " << nsteps << " time steps"
                  << " (" << nx*ny << " grid points)\n";
    }

    // from here on ny is the local (per-rank) y dimension
    ny /= mpi_size;

    // adjust dimensions for halo
    nx += 2;
    ny += 2;

    // allocate memory on device and host
    // note : allocate enough memory for the halo around the boundary
    auto buffer_size = nx*ny;

#ifdef OPENACC_DATA
    double *x0     = malloc_host_pinned<double>(buffer_size);
    double *x1     = malloc_host_pinned<double>(buffer_size);
#else
    double *x_host = (double *) malloc(buffer_size*sizeof(double));
    // double *x_host = malloc_host_pinned<double>(buffer_size);
    double *x0     = malloc_device<double>(buffer_size);
    double *x1     = malloc_device<double>(buffer_size);
#endif

    double start_diffusion, time_diffusion;

#ifdef OPENACC_DATA
    // TODO: move data to the GPU
#endif
    {
        // set initial conditions of 0 everywhere
        fill_gpu(x0, 0., buffer_size);
        fill_gpu(x1, 0., buffer_size);

        // set boundary conditions of 1 on the global south border (first rank) ...
        if (mpi_rank == 0) {
            fill_gpu(x0, 1., nx);
            fill_gpu(x1, 1., nx);
        }

        // ... and on the global north border (last rank)
        if (mpi_rank == mpi_size-1) {
            fill_gpu(x0+nx*(ny-1), 1., nx);
            fill_gpu(x1+nx*(ny-1), 1., nx);
        }

        // neighbour ranks; out-of-range values mean "no neighbour on that side"
        auto south = mpi_rank - 1;
        auto north = mpi_rank + 1;

        // time stepping loop
        #pragma acc wait
        start_diffusion = get_time();
        // size_t index: `auto step=0` would be an int compared against size_t nsteps
        for(size_t step=0; step<nsteps; ++step) {
            MPI_Request requests[4];
            MPI_Status  statuses[4];
            auto num_requests = 0;

#ifdef OPENACC_DATA
            // TODO: There are two ways to communicate:
            //   1. Update the host copy first and then communicate
            //   2. Use the optimised RDMA data path
#endif
            {
                // exchange with south
                if (south >= 0) {
                    // x0(:, 0) <- south
                    MPI_Irecv(x0,    nx, MPI_DOUBLE, south, 0, MPI_COMM_WORLD,
                              &requests[0]);
                    // x0(:, 1) -> south
                    MPI_Isend(x0+nx, nx, MPI_DOUBLE, south, 0, MPI_COMM_WORLD,
                              &requests[1]);
                    num_requests += 2;
                }

                // exchange with north
                if(north < mpi_size) {
                    // x0(:, ny-1) <- north
                    MPI_Irecv(x0+(ny-1)*nx, nx, MPI_DOUBLE, north, 0,
                              MPI_COMM_WORLD, &requests[num_requests]);
                    // x0(:, ny-2) -> north
                    MPI_Isend(x0+(ny-2)*nx, nx, MPI_DOUBLE, north, 0,
                              MPI_COMM_WORLD, &requests[num_requests+1]);
                    num_requests += 2;
                }
            }

            // halos must be complete before the stencil reads them
            MPI_Waitall(num_requests, requests, statuses);

            diffusion_gpu(x0, x1, nx-2, ny-2, dt);

#ifdef OPENACC_DATA
            copy_gpu(x0, x1, buffer_size);
#else
            std::swap(x0, x1);
#endif
        }

        #pragma acc wait
        time_diffusion = get_time() - start_diffusion;
    } // end of acc data

#ifdef OPENACC_DATA
    auto x_res = x1;
#else
    copy_to_host<double>(x0, x_host, buffer_size);
    auto x_res = x_host;
#endif

    if (mpi_rank == 0) {
        std::cout << "## " << time_diffusion << "s, "
                  << nsteps*(nx-2)*(ny-2)*mpi_size / time_diffusion
                  << " points/second\n\n";

        std::cout << "writing to output.bin/bov\n";
        write_to_file(nx, ny, x_res);
    }

    MPI_Finalize();
    return 0;
}