void conv3d_blas_cpu::bprop() { create_dX(); create_dF(); create_dB(); init_convmat(); init_u(); mwSize N = getVolN(X); matw dF_ = make_dF_(); matw dB_ = make_dB_(); for (mwSize i = 0; i < N; ++i) { // make phiX: the convolution matrix vol_to_convmat(X, i); // dF += phiX' * dY_ matw dY_ = make_dY_(i); ATxBtoC(convmat, dY_, dF_, false); // accumulation on dF_ // dB += u' * dY ATxBtoC(u, dY_, dB_, false); // accumulation on dB_ // dphiX = dY * F' matw F_ = make_F_(); // safe to reuse convmat memory, remember to overwrite it! AxBTtoC(dY_, F_, convmat, true); // dX(:,:,:,:,i) <-- dphiX vol_from_convmat(dX, i); } free_u(); free_convmat(); }
void conv3d_blas_cpu::fprop() { create_Y(); init_convmat(); init_u(); // iterate over each training instance mwSize N = getVolN(X); for (mwSize i = 0; i < N; i++) { // make phiX: the convolution matrix vol_to_convmat(X, i); // convolution: Y_ = phiX * F_ matw F_ = make_F_(); matw Y_ = make_Y_(i); AxBtoC(convmat, F_, Y_, true); // overwrite Y_ // plus the bias: Y_ += u * B matw B_ = make_B_(); AxBtoC(u, B_, Y_, false); // accumulation on Y_ } free_u(); free_convmat(); }
/** * \brief main function */ int main(int argc, char *argv[]) { int ierr; int num_procs; int tag = 1; double h = 1; int steps = 1; char *basedir = NULL; udata_t udata; udata.nx = 1; udata.ny = 1; udata.dimension = 16; udata.algorithm = 0; udata.picturesteps = 1; // initialize MPI ierr = MPI_Init(&argc, &argv); if (ierr) { fprintf(stderr, "cannot initialize MPI: %d\n", ierr); return EXIT_FAILURE; } // get MPI dimension parameters ierr = MPI_Comm_rank(MPI_COMM_WORLD, &udata.rank); ierr = MPI_Comm_size(MPI_COMM_WORLD, &num_procs); char rankprefix[10]; snprintf(rankprefix, sizeof(rankprefix), "%d", udata.rank); if (debug) { fprintf(stderr, "%s:%d[%d]: process id %d\n", __FILE__, __LINE__, udata.rank, getpid()); } // parse the command line int c; while (EOF != (c = getopt(argc, argv, "b:dh:r:s:t:x:y:n:a:?"))) switch (c) { case 'd': debug++; break; case 's': udata.picturesteps = atoi(optarg); break; case 't': udata.maxsteps = atof(optarg); break; case 'x': udata.nx = atoi(optarg); break; case 'y': udata.ny = atoi(optarg); break; case 'b': basedir = optarg; break; case 'n': udata.dimension = atoi(optarg); break; case 'a': udata.algorithm = atoi(optarg); break; case '?': usage(argv[0]); return EXIT_SUCCESS; } udata.maxsteps = udata.dimension + udata.dimension; udata.h = 1 / udata.dimension; // make sure the arguments are consistent if (num_procs != udata.nx * udata.ny) { fprintf(stderr, "number of processes does not match " "dimensions: %d != %d x %d\n", num_procs, udata.nx, udata.ny); usage(argv[0]); return EXIT_FAILURE; } // compute horizontal and vertical index of this rank udata.rh = udata.rank % udata.nx; udata.rv = udata.rank / udata.nx; if (debug) { fprintf(stderr, "%s:%d[%d]: rh = %d, rv = %d\n", __FILE__, __LINE__, udata.rank, udata.rh, udata.rv); } // next argument is image file name if (argc <= optind) { fprintf(stderr, "image file name argument missing\n"); usage(argv[0]); return EXIT_FAILURE; } char *imagefilename = argv[optind++]; // next argument is output file name char *netcdffilename = NULL; if (argc > optind) { netcdffilename = argv[optind++]; if (debug) { fprintf(stderr, "%s:%d: netcdffilename: %s\n", __FILE__, __LINE__, netcdffilename); } } // image file and output file heatfile_t *hf = NULL; image_t *image = NULL; // process zero initializes and writes data if (udata.rank == 0) { // read the image file image = readimage(imagefilename); if (NULL == image) { fprintf(stderr, "cannot read image\n"); return EXIT_FAILURE; } if (debug) { fprintf(stderr, "%s:%d[%d]: %d x %d image read\n", __FILE__, __LINE__, udata.rank, image->width, image->height); } // create the output file if (netcdffilename) { if (debug) { fprintf(stderr, "%s:%d: creating NetCDF %s\n", __FILE__, __LINE__, netcdffilename); } hf = output2_create(netcdffilename, h, steps * udata.ht, image->width, image->height); if (NULL == hf) { fprintf(stderr, "cannot create output file\n"); return EXIT_FAILURE; } } } // write the first image if ((basedir) && (udata.rank == 0)) { char outfilename[1024]; snprintf(outfilename, sizeof(outfilename), "%s/00000.fits", basedir); writeimage(image, outfilename); } // index ranges for each rank udata.ranges = (int *)malloc(4 * num_procs * sizeof(int)); if (udata.rank == 0) { partitiondomain(&udata, image); } // exchange range size information with all other ranks. The ranks // then pick the dimensions they need from the array, this is // the purpose of the range pointer MPI_Bcast(udata.ranges, 4 * num_procs, MPI_INT, 0, MPI_COMM_WORLD); int *range = &udata.ranges[4 * udata.rank]; if (debug) { fprintf(stderr, "%s:%d:[%d]: [%d,%d) x [%d,%d)\n", __FILE__, __LINE__, udata.rank, range[0], range[1], range[2], range[3]); } // allocate memory for the area we are responsible for udata.width = range[1] - range[0]; udata.height = range[3] - range[2]; allocate_u(&udata); double *unew = (double *)malloc(udata.length * sizeof(double)); if (debug) { fprintf(stderr, "%s:%d[%d]: arrays allocated, %d x %d\n", __FILE__, __LINE__, udata.rank, udata.width, udata.height); } // write initial data to the output file if ((hf) && (udata.rank == 0)) { output2_add(hf, 0, image->data); } // measure start time (after all allocations are done) double start = gettime(); // process 0 has to send the data to all the other processes if (udata.rank == 0) { for (int r = 1; r < num_procs; r++) { sendimagerange(&udata, image, r, tag); } copyfromimage(&udata, image); } else { // receive my part of the matrix receiverange(&udata, tag); } tag++; // make sure dimension is correct if (udata.dimension != udata.height + udata.width) { fprintf(stderr, "dimension does not match\n"); return EXIT_FAILURE; } // start the solver algorithm int stepcounter = 0; // counter for time steps while (stepcounter < udata.maxsteps) { stepcounter++; // copy everything to unew as the initial approximation for (int i = 0; i < udata.length; i++) { unew[i] = udata.u[i]; } // now perform <picturesteps> iterations for (int k = 0; k < udata.picturesteps; k++) { // synchronize current values of boundary with neighbors tag++; exchange_boundaries(&udata, tag); // perform iteration step iterate_u(unew, &udata); // copy the new u to the old u / only used for Jacobi if (udata.algorithm == 0) { for (int i = 0; i < udata.length; i++) { udata.u[i] = unew[i]; } } } // decide whether we have to output something if (0) { // time value for this data output int stepvalue = stepcounter / udata.picturesteps; // output needed, so we synchronize image data tag++; synchronize_image(&udata, image, tag); // write an image if ((basedir) && (udata.rank == 0)) { char outfilename[1024]; snprintf(outfilename, sizeof(outfilename), "%s/%05d.fits", basedir, stepvalue); writeimage(image, outfilename); } // write solution data if ((hf) && (udata.rank == 0)) { output2_add(hf, stepvalue, image->data); } } } // measure end time double end = gettime(); // we are now done, rank 0 displays the result if (udata.rank == 0) { printf("%.6f",end - start); } // close the netcdf file if ((udata.rank == 0) && (hf)) { output_close(hf); } // cleanup MPI MPI_Finalize(); // cleanup the memory we have allocated free(udata.ranges); udata.ranges = NULL; free_u(&udata); return EXIT_SUCCESS; }