Exemplo n.º 1
0
int main(int argc, char** argv) {

    Communicator* comm_unit = new Communicator(argc, argv);

    ProjectSettings* settings = new ProjectSettings(argc, argv, comm_unit->getRank());

    int dim = settings->GetSettingAs<int>("DIMENSION", ProjectSettings::required); 
    int nx = settings->GetSettingAs<int>("NB_X", ProjectSettings::required); 
    int ny = 1; 
    int nz = 1; 
    if (dim > 1) {
    	ny = settings->GetSettingAs<int>("NB_Y", ProjectSettings::required); 
    }
    if (dim > 2) {
	nz = settings->GetSettingAs<int> ("NB_Z", ProjectSettings::required); 
    } 
    if (dim > 3) {
	cout << "ERROR! Dim > 3 Not supported!" << endl;
	exit(EXIT_FAILURE); 
    }

    double minX = settings->GetSettingAs<double>("MIN_X", ProjectSettings::optional, "-1."); 	
    double maxX = settings->GetSettingAs<double>("MAX_X", ProjectSettings::optional, "1."); 	
    double minY = settings->GetSettingAs<double>("MIN_Y", ProjectSettings::optional, "-1."); 	
    double maxY = settings->GetSettingAs<double>("MAX_Y", ProjectSettings::optional, "1."); 	
    double minZ = settings->GetSettingAs<double>("MIN_Z", ProjectSettings::optional, "-1."); 	
    double maxZ = settings->GetSettingAs<double>("MAX_Z", ProjectSettings::optional, "1."); 

    double stencil_size = settings->GetSettingAs<int>("STENCIL_SIZE", ProjectSettings::required); 

    int use_gpu = settings->GetSettingAs<int>("USE_GPU", ProjectSettings::optional, "1"); 

    Grid* grid = NULL; 
 
    if (dim == 1) {
	    grid = new RegularGrid(nx, 1, minX, maxX, 0., 0.); 
    } else if (dim == 2) {
	    grid = new RegularGrid(nx, ny, minX, maxX, minY, maxY); 
    } else if (dim == 3) {
	    grid = new RegularGrid(nx, ny, nz, minX, maxX, minY, maxY, minZ, maxZ); 
    } else {
	    cout << "ERROR! Dim > 3 Not Supported!" << endl;
    }

    grid->setSortBoundaryNodes(true); 
    grid->generate();
    grid->generateStencils(stencil_size, Grid::ST_BRUTE_FORCE);   // nearest nb_points
    grid->writeToFile(); 


    // 0: 2D problem; 1: 3D problem
    //ExactSolution* exact_heat_regulargrid = new ExactRegularGrid(dim, 1.0, 1.0);

    RBFFD* der;
    if (use_gpu) {
        der = new RBFFD_CL(RBFFD::X | RBFFD::Y | RBFFD::Z | RBFFD::LAPL, grid, dim); 
    } else {
        der = new RBFFD(RBFFD::X | RBFFD::Y | RBFFD::Z | RBFFD::LAPL, grid, dim); 
    }

    double epsilon = settings->GetSettingAs<double>("EPSILON");
    der->setEpsilon(epsilon);

    printf("start computing weights\n");
    //vector<StencilType>& stencil = grid->getStencils();
    vector<NodeType>& rbf_centers = grid->getNodeList();
    der->computeAllWeightsForAllStencils();
    cout << "end computing weights" << endl;

    vector<double> u(rbf_centers.size(),1.);
    cout << "start computing derivative (on CPU)" << endl;
	    

    vector<double> xderiv_cpu(rbf_centers.size());	
    vector<double> xderiv_gpu(rbf_centers.size());	
    vector<double> yderiv_cpu(rbf_centers.size());	
    vector<double> yderiv_gpu(rbf_centers.size());	
    vector<double> zderiv_cpu(rbf_centers.size());	
    vector<double> zderiv_gpu(rbf_centers.size());	
    vector<double> lderiv_cpu(rbf_centers.size());	
    vector<double> lderiv_gpu(rbf_centers.size());	

    // Verify that the CPU works
    // NOTE: we pass booleans at the end of the param list to indicate that
    // the function "u" is new (true) or same as previous calls (false). This
    // helps avoid overhead of passing "u" to the GPU.
    der->RBFFD::applyWeightsForDeriv(RBFFD::X, u, xderiv_cpu, true);
    der->RBFFD::applyWeightsForDeriv(RBFFD::Y, u, yderiv_cpu, false); // originally false
    der->RBFFD::applyWeightsForDeriv(RBFFD::Z, u, zderiv_cpu, false); // orig false
    der->RBFFD::applyWeightsForDeriv(RBFFD::LAPL, u, lderiv_cpu, false); // orig false

    der->applyWeightsForDeriv(RBFFD::X, u, xderiv_gpu, true);
    der->applyWeightsForDeriv(RBFFD::Y, u, yderiv_gpu, false); // orig false
    der->applyWeightsForDeriv(RBFFD::Z, u, zderiv_gpu, false); // orig: false
    der->applyWeightsForDeriv(RBFFD::LAPL, u, lderiv_gpu, false); // orig: false


    double max_diff = 0.; 
    for (size_t i = 0; i < rbf_centers.size(); i++) {
	double xdiff = fabs(xderiv_gpu[i] - xderiv_cpu[i]); 
	double ydiff = fabs(yderiv_gpu[i] - yderiv_cpu[i]);
	double zdiff = fabs(zderiv_gpu[i] - zderiv_cpu[i]);
	double ldiff = fabs(lderiv_gpu[i] - lderiv_cpu[i]);

	if (xdiff > max_diff) { max_diff = xdiff; }
	if (ydiff > max_diff) { max_diff = ydiff; }
	if (zdiff > max_diff) { max_diff = zdiff; }
	if (ldiff > max_diff) { max_diff = ldiff; }

//        std::cout << "cpu_x_deriv[" << i << "] - gpu_x_deriv[" << i << "] = " << xderiv_cpu[i] - xderiv_gpu[i] << std::endl;
        if (( xdiff > 1e-5) 
        || ( ydiff > 1e-5) 
        || ( zdiff > 1e-5) 
        || ( ldiff > 1e-5))
        {
            std::cout << "WARNING! SINGLE PRECISION GPU COULD NOT CALCULATE DERIVATIVE WELL ENOUGH!\n";
	    std::cout << "Test failed on " << i << std::endl;
	    std::cout << "X: " << xderiv_gpu[i] - xderiv_cpu[i] << std:: endl; 
	    std::cout << "X: " << xderiv_gpu[i] << ", " <<  xderiv_cpu[i] << std:: endl; 
	    std::cout << "Y: " << yderiv_gpu[i] - yderiv_cpu[i] << std:: endl; 
	    std::cout << "Y: " << yderiv_gpu[i] << ", " <<  yderiv_cpu[i] << std:: endl; 
	    std::cout << "Z: " << zderiv_gpu[i] - zderiv_cpu[i] << std:: endl; 
	    std::cout << "Z: " << zderiv_gpu[i] << ", " <<  zderiv_cpu[i] << std:: endl; 
	    std::cout << "LAPL: " << lderiv_gpu[i] - lderiv_cpu[i] << std:: endl; 
            exit(EXIT_FAILURE); 
        }
    }
    std::cout << "Max difference between weights: " << max_diff << std::endl;
    std::cout << "CONGRATS! ALL DERIVATIVES WERE CALCULATED THE SAME IN OPENCL AND ON THE CPU\n";
       // (WITH AN AVERAGE ERROR OF:" << avg_error << std::endl;

   // der->applyWeightsForDeriv(RBFFD::Y, u, yderiv);
   // der->applyWeightsForDeriv(RBFFD::LAPL, u, lapl_deriv);


#if 0
    if (settings->GetSettingAs<int>("RUN_DERIVATIVE_TESTS")) {
        RBFFDTests* der_test = new DerivativeTests();
        der_test->testAllFunctions(*der, *grid);
    }
#endif 


//    delete(subdomain);
    delete(grid);
    delete(settings);

    cout.flush();

    exit(EXIT_SUCCESS);
}
Exemplo n.º 2
0
int main(int argc, char** argv) {

    ProjectSettings* settings = new ProjectSettings(argc, argv);


    int dim = settings->GetSettingAs<int>("DIMENSION", ProjectSettings::required);
    int nb_interior = settings->GetSettingAs<int>("NB_INTERIOR", ProjectSettings::required);
    int nb_inner_boundary = settings->GetSettingAs<int>("NB_INNER_BOUNDARY", ProjectSettings::optional, "0");
    int nb_outer_boundary = settings->GetSettingAs<int>("NB_OUTER_BOUNDARY", ProjectSettings::optional, "0");
    int nb_boundary = nb_inner_boundary + nb_outer_boundary;
    int nb_total = nb_interior + nb_boundary;

    if (dim > 3) {
        cout << "ERROR! Dim > 3 Not supported!" << endl;
        exit(EXIT_FAILURE);
    }

    double inner_r = settings->GetSettingAs<double>("INNER_RADIUS", ProjectSettings::optional, "0.5");
    double outer_r = settings->GetSettingAs<double>("OUTER_RADIUS", ProjectSettings::optional, "1.0");

    double inner_axis_major = settings->GetSettingAs<double>("INNER_AXIS_MAJOR", ProjectSettings::optional, "0.");
    double inner_axis_minor = settings->GetSettingAs<double>("INNER_AXIS_MINOR", ProjectSettings::optional, "0.");
    double outer_axis_major = settings->GetSettingAs<double>("OUTER_AXIS_MAJOR", ProjectSettings::optional, "0.");
    double outer_axis_minor = settings->GetSettingAs<double>("OUTER_AXIS_MINOR", ProjectSettings::optional, "0.");

    int ns_nx = settings->GetSettingAs<int>("NS_NB_X", ProjectSettings::optional, "10");
    int ns_ny = settings->GetSettingAs<int>("NS_NB_Y", ProjectSettings::optional, "10");
    int ns_nz = settings->GetSettingAs<int>("NS_NB_Z", ProjectSettings::optional, "10");

    double minX = settings->GetSettingAs<double>("MIN_X", ProjectSettings::optional, "-1.");
    double maxX = settings->GetSettingAs<double>("MAX_X", ProjectSettings::optional, "1.");
    double minY = settings->GetSettingAs<double>("MIN_Y", ProjectSettings::optional, "-1.");
    double maxY = settings->GetSettingAs<double>("MAX_Y", ProjectSettings::optional, "1.");
    double minZ = settings->GetSettingAs<double>("MIN_Z", ProjectSettings::optional, "-1.");
    double maxZ = settings->GetSettingAs<double>("MAX_Z", ProjectSettings::optional, "1.");

    double debug = settings->GetSettingAs<int>("DEBUG", ProjectSettings::optional, "0");


    // 0 = Dirichlet, 1 = neumann, 2 = robin
    int boundary_condition = settings->GetSettingAs<int>("BOUNDARY_CONDITION", ProjectSettings::optional, "0");
    // 0 = discrete rhs, 1 = exact (test discrete compat condition)
    int use_discrete_rhs = settings->GetSettingAs<int>("USE_DISCRETE_RHS", ProjectSettings::optional, "0");
    // 0 = assume non-uniform diffusion, 1 = assume uniform
    int use_uniform_diffusion = settings->GetSettingAs<int>("USE_UNIFORM_DIFFUSION", ProjectSettings::optional, "1");
    int run_derivative_tests = settings->GetSettingAs<int>("RUN_DERIVATIVE_TESTS", ProjectSettings::optional, "1");

    int stencil_size = settings->GetSettingAs<int>("STENCIL_SIZE", ProjectSettings::required);

    int use_gpu = settings->GetSettingAs<int>("USE_GPU", ProjectSettings::optional, "1");

    int nb_samples = settings->GetSettingAs<int>("NB_CVT_SAMPLES", ProjectSettings::required);
    int it_max_interior = settings->GetSettingAs<int>("NB_CVT_ITERATIONS", ProjectSettings::required);
    // Generate a CVT with nx*ny*nz nodes, in 1, 2 or 3D with 0 locked boundary nodes,
    // 20000 samples per iteration for 30 iterations
    NestedEllipseCVT* grid;
    if (nb_boundary) {
        // Specify the exact number of nodes on the boundary
        grid = new NestedEllipseCVT(nb_total, nb_inner_boundary, nb_outer_boundary, dim, new UniformDensity(), 0, nb_samples, it_max_interior);
    } else {
        // Guess the number of nodes on the boundary (usually looks nicer)
        grid = new NestedEllipseCVT(nb_total, dim, new UniformDensity(), 0, nb_samples, it_max_interior);
    }
    grid->setExtents(minX, maxX, minY, maxY, minZ, maxZ);

    if (!inner_axis_minor) {
        grid->setInnerRadius(inner_r);
        grid->setOuterRadius(outer_r);
    } else {
        grid->setInnerAxes(inner_axis_major, inner_axis_minor);
        grid->setOuterAxes(outer_axis_major, outer_axis_minor);
    }

    grid->setDebug(debug);
    grid->setMaxStencilSize(stencil_size);
    grid->setNSHashDims(ns_nx, ns_ny, ns_nz);


    int writeIntermediate = 2;

    Grid::GridLoadErrType err = grid->loadFromFile();
        if (err == Grid::NO_GRID_FILES)
        {
            printf("************** Generating new Grid **************\n");
            grid->setSortBoundaryNodes(true);
            grid->generate();
            if(writeIntermediate > 0) {
                grid->writeToFile();
            }
        }
        if ((err == Grid::NO_GRID_FILES) || (err == Grid::NO_STENCIL_FILES)) {
            std::cout << "Generating stencils files\n";
            grid->setNSHashDims(ns_nx, ns_ny, ns_nz);
//            grid->generateStencils(Grid::ST_BRUTE_FORCE);
//            grid->generateStencils(Grid::ST_KDTREE);
            grid->generateStencils(Grid::ST_HASH);
            if(writeIntermediate > 0) {
                grid->writeToFile();
            }
        }

        // 0: 2D problem; 1: 3D problem
    ExactSolution* exact_poisson;
    if (dim == 3) {
        std::cout << "ERROR! 3D not verified yet! exiting..." << std::endl;
        exit(EXIT_FAILURE);
        //     exact_poisson = new ExactNCARPoisson1();        // 3D problem is not verified yet
    } else {
        exact_poisson = new ExactNCARPoisson2();        // 2D problem works with uniform diffusion
    }

    RBFFD* der;
#if 0
    if (use_gpu) {
        der = new RBFFD_CL(RBFFD::X|RBFFD::Y|RBFFD::Z|RBFFD::LAPL,grid, dim);
    } else {
        der = new RBFFD(RBFFD::X|RBFFD::Y|RBFFD::Z|RBFFD::LAPL,grid, dim);
    }
#endif

    std::cout << "Computing weights for DIM = " << dim << std::endl;
    // No support for ViennaCL generated weights yet.
    der = new RBFFD(RBFFD::X|RBFFD::Y|RBFFD::Z|RBFFD::LAPL,grid, dim);

   // Enable variable epsilon. Not verified to be perfected in the derivative calculation.
   // But it has improved the heat equation already
    int use_var_eps = settings->GetSettingAs<int>("USE_VAR_EPSILON", ProjectSettings::optional, "0");
    if (use_var_eps) {
        double alpha = settings->GetSettingAs<double>("VAR_EPSILON_ALPHA", ProjectSettings::optional, "1.0");
        double beta = settings->GetSettingAs<double>("VAR_EPSILON_BETA", ProjectSettings::optional, "1.0");
        der->setVariableEpsilon(alpha, beta);
    } else {
        double epsilon = settings->GetSettingAs<double>("EPSILON", ProjectSettings::required);
        der->setEpsilon(epsilon);
    }

    der->computeAllWeightsForAllStencils();

    if (run_derivative_tests) {
        std::cout << "Running Derivative Tests\n";
        DerivativeTests* der_test = new DerivativeTests(dim, der, grid, true);
        if (use_gpu) {
            // Applies weights on both GPU and CPU and compares results for the first 10 stencils
            der_test->compareGPUandCPUDerivs(10);
        }
        // Test approximations to derivatives of functions f(x,y,z) = 0, x, y, xy, etc. etc.
        der_test->testAllFunctions();
        // For now we can only test eigenvalues on an MPI size of 1 (we could distribute with Par-Eiegen solver)
        if (settings->GetSettingAs<int>("DERIVATIVE_EIGENVALUE_TEST", ProjectSettings::optional, "0"))
        {
            // FIXME: why does this happen? Perhaps because X Y and Z are unidirectional?
            // Test X and 4 eigenvalues are > 0
            // Test Y and 30 are > 0
            // Test Z and 36 are > 0
            // NOTE: the 0 here implies we compute the eigenvalues but do not run the iterations of the random perturbation test
            der_test->testEigen(RBFFD::LAPL, 0);
        }
    }


    NCARPoisson1* poisson;
//    if (use_gpu) {
    if (true) {
        poisson = new NonUniformPoisson1_CL(exact_poisson, grid, der, 0, dim);
    } else {
        poisson = new NCARPoisson1(exact_poisson, grid, der, 0, dim);
    }
    poisson->setBoundaryCondition(boundary_condition);
    poisson->setUseDiscreteRHS(use_discrete_rhs);
    poisson->setUseUniformDiffusivity(use_uniform_diffusion);

    poisson->initialConditions();
    poisson->solve();

    delete(poisson);
//    delete(der);
    delete(grid);
    delete(settings);
#if 0
    Grid* grid2 = new Grid();
    grid2->loadFromFile("initial_grid.ascii");
    grid2->writeToFile("final_grid.ascii");

    cout.flush();
#endif
    exit(EXIT_SUCCESS);
}
Exemplo n.º 3
0
int main(int argc, char** argv) {
    TimerList tm;

    tm["total"] = new Timer("[Main] Total runtime for this proc");
    tm["grid"] = new Timer("[Main] Grid generation");
    tm["stencils"] = new Timer("[Main] Stencil generation");
    tm["settings"] = new Timer("[Main] Load settings"); 
    tm["decompose"] = new Timer("[Main] Decompose domain"); 
    tm["consolidate"] = new Timer("[Main] Consolidate subdomain solutions"); 
    tm["updates"] = new Timer("[Main] Broadcast solution updates"); 
    tm["send"] = new Timer("[Main] Send subdomains to other processors (master only)"); 
    tm["receive"] = new Timer("[Main] Receive subdomain from master (clients only)"); 
    tm["timestep"] = new Timer("[Main] Advance One Timestep"); 
    tm["tests"] = new Timer("[Main] Test stencil weights"); 
    tm["weights"] = new Timer("[Main] Compute all stencils weights"); 
    tm["oneWeight"] = new Timer("[Main] Compute single stencil weights"); 
    tm["heat_init"] = new Timer("[Main] Initialize heat"); 
    // grid should only be valid instance for MASTER
    Grid* grid = NULL; 
    Domain* subdomain; 

    tm["total"]->start(); 

    Communicator* comm_unit = new Communicator(argc, argv);

    cout << " Got Rank: " << comm_unit->getRank() << endl;
    cout << " Got Size: " << comm_unit->getSize() << endl;

    tm["settings"]->start(); 

    ProjectSettings* settings = new ProjectSettings(argc, argv, comm_unit->getRank());

    int dim = settings->GetSettingAs<int>("DIMENSION", ProjectSettings::required); 

    //-----------------
    fillGlobalProjectSettings(dim, settings);
    //-----------------

    int max_num_iters = settings->GetSettingAs<int>("MAX_NUM_ITERS", ProjectSettings::required); 
    double max_global_rel_error = settings->GetSettingAs<double>("MAX_GLOBAL_REL_ERROR", ProjectSettings::optional, "1e-1"); 
    double max_local_rel_error = settings->GetSettingAs<double>("MAX_LOCAL_REL_ERROR", ProjectSettings::optional, "1e-1"); 

    int use_gpu = settings->GetSettingAs<int>("USE_GPU", ProjectSettings::optional, "1"); 
    
    int local_sol_dump_frequency = settings->GetSettingAs<int>("LOCAL_SOL_DUMP_FREQUENCY", ProjectSettings::optional, "100"); 
    int global_sol_dump_frequency = settings->GetSettingAs<int>("GLOBAL_SOL_DUMP_FREQUENCY", ProjectSettings::optional, "200"); 

    int prompt_to_continue = settings->GetSettingAs<int>("PROMPT_TO_CONTINUE", ProjectSettings::optional, "0"); 
    int debug = settings->GetSettingAs<int>("DEBUG", ProjectSettings::optional, "0"); 

    double start_time = settings->GetSettingAs<double>("START_TIME", ProjectSettings::optional, "0.0"); 
    double end_time = settings->GetSettingAs<double>("END_TIME", ProjectSettings::optional, "1.0"); 
    double dt = settings->GetSettingAs<double>("DT", ProjectSettings::optional, "1e-5"); 
    int timescheme = settings->GetSettingAs<int>("TIME_SCHEME", ProjectSettings::optional, "1"); 
    int weight_method = settings->GetSettingAs<int>("WEIGHT_METHOD", ProjectSettings::optional, "1"); 
    int compute_eigenvalues = settings->GetSettingAs<int>("DERIVATIVE_EIGENVALUE_TEST", ProjectSettings::optional, "0");
    int use_eigen_dt = settings->GetSettingAs<int>("USE_EIGEN_DT", ProjectSettings::optional, "1");

    if (comm_unit->isMaster()) {

        int ns_nx = settings->GetSettingAs<int>("NS_NB_X", ProjectSettings::optional, "10"); 
        int ns_ny = settings->GetSettingAs<int>("NS_NB_Y", ProjectSettings::optional, "10");
        int ns_nz = settings->GetSettingAs<int>("NS_NB_Z", ProjectSettings::optional, "10");

        int stencil_size = settings->GetSettingAs<int>("STENCIL_SIZE", ProjectSettings::required); 

        tm["settings"]->stop(); 

        grid = getGrid(dim);

        grid->setMaxStencilSize(stencil_size); 

        Grid::GridLoadErrType err = grid->loadFromFile(); 
        if (err == Grid::NO_GRID_FILES) 
        {
            printf("************** Generating new Grid **************\n"); 
            //grid->setSortBoundaryNodes(true); 
//            grid->setSortBoundaryNodes(true); 
            tm["grid"]->start(); 
            grid->generate();
            tm["grid"]->stop(); 
            grid->writeToFile(); 
        } 
        if ((err == Grid::NO_GRID_FILES) || (err == Grid::NO_STENCIL_FILES)) {
            std::cout << "Generating stencils files\n";
            tm["stencils"]->start(); 
            grid->setNSHashDims(ns_nx, ns_ny, ns_nz);
//            grid->generateStencils(Grid::ST_BRUTE_FORCE);   
            // DEFINITELY: exact
            grid->generateStencils(Grid::ST_KDTREE);   
            // MIGHT BE: approximate
//            grid->generateStencils(Grid::ST_HASH);   
            tm["stencils"]->stop();
            grid->writeToFile(); 
            tm.writeToFile("gridgen_timer_log"); 
        }

        int x_subdivisions = comm_unit->getSize();		// reduce this to impact y dimension as well 
        int y_subdivisions = (comm_unit->getSize() - x_subdivisions) + 1; 

        // TODO: load subdomain from disk

        // Construct a new domain given a grid. 
        // TODO: avoid filling sets Q, B, etc; just think of it as a copy constructor for a grid
        Domain* original_domain = new Domain(dim, grid, comm_unit->getSize()); 
        // pre allocate pointers to all of the subdivisions
        std::vector<Domain*> subdomain_list(x_subdivisions*y_subdivisions);
        // allocate and fill in details on subdivisions

        std::cout << "Generating subdomains\n";
        tm["decompose"]->start();
        //original_domain->printVerboseDependencyGraph();
        original_domain->generateDecomposition(subdomain_list, x_subdivisions, y_subdivisions); 
        tm["decompose"]->stop();

        tm["send"]->start(); 
        subdomain = subdomain_list[0]; 
        for (int i = 1; i < comm_unit->getSize(); i++) {
            std::cout << "Sending subdomain[" << i << "]\n";
            comm_unit->sendObject(subdomain_list[i], i); 
        }
        tm["send"]->stop(); 

        printf("----------------------\nEND MASTER ONLY\n----------------------\n\n\n");

    } else {
        tm["settings"]->stop(); 
        cout << "MPI RANK " << comm_unit->getRank() << ": waiting to receive subdomain"
            << endl;

        tm["receive"]->start(); 
        subdomain = new Domain(); // EMPTY object that will be filled by MPI
        comm_unit->receiveObject(subdomain, 0); // Receive from CPU (0)
        tm["receive"]->stop(); 
    }

    comm_unit->barrier();

    if (debug) {
        subdomain->printVerboseDependencyGraph();
        subdomain->printNodeList("All Centers Needed by This Process"); 

        printf("CHECKING STENCILS: ");
        for (int irbf = 0; irbf < (int)subdomain->getStencilsSize(); irbf++) {
            //  printf("Stencil[%d] = ", irbf);
            StencilType& s = subdomain->getStencil(irbf); 
            if (irbf == s[0]) {
                //	printf("PASS\n");
                //    subdomain->printStencil(s, "S"); 
            } else {
                printf("FAIL on stencil %d\n", irbf);
                exit(EXIT_FAILURE);
            }
        }
        printf("OK\n");
    }

    RBFFD* der;
    if (use_gpu) {
        der = new RBFFD_CL(RBFFD::LAPL | RBFFD::X | RBFFD::Y | RBFFD::Z, subdomain, dim, comm_unit->getRank()); 
    } else {
        der = new RBFFD(RBFFD::LAPL | RBFFD::X | RBFFD::Y | RBFFD::Z, subdomain, dim, comm_unit->getRank()); 
    }

    int use_var_eps = settings->GetSettingAs<int>("USE_VAR_EPSILON", ProjectSettings::optional, "0");
    if (use_var_eps) {
        double alpha = settings->GetSettingAs<double>("VAR_EPSILON_ALPHA", ProjectSettings::optional, "1.0"); 
        double beta = settings->GetSettingAs<double>("VAR_EPSILON_BETA", ProjectSettings::optional, "1.0"); 
        //der->setVariableEpsilon(subdomain->getStencilRadii(), subdomain->getStencils(), alpha, beta); 
        der->setVariableEpsilon(alpha, beta); 
    } else {
        double epsilon = settings->GetSettingAs<double>("EPSILON", ProjectSettings::required);
        der->setEpsilon(epsilon);
    }

#if 0
        der->setWeightType(RBFFD::ContourSVD);
        der->setWeightType(RBFFD::Direct);
#else 
        der->setWeightType((RBFFD::WeightType)weight_method);
#endif 
 
    // Try loading all the weight files
    int err = der->loadFromFile(RBFFD::X); 
    err += der->loadFromFile(RBFFD::Y); 
    err += der->loadFromFile(RBFFD::Z); 
    err += der->loadFromFile(RBFFD::LAPL); 

    if (err) { 
        printf("start computing weights\n");
        tm["weights"]->start(); 

        // NOTE: good test for Direct vs Contour
        // Grid 11x11, vareps=0.05; Look at stencil 12. SHould have -100, 25,
        // 25, 25, 25 (i.e., -4,1,1,1,1) not sure why scaling is off.
        der->computeAllWeightsForAllStencils();
        tm["weights"]->stop(); 

        cout << "end computing weights" << endl;

        der->writeToFile(RBFFD::X);
        der->writeToFile(RBFFD::Y);
        der->writeToFile(RBFFD::Z);
        der->writeToFile(RBFFD::LAPL);

        cout << "end write weights to file" << endl;
    }

    if (settings->GetSettingAs<int>("RUN_DERIVATIVE_TESTS", ProjectSettings::optional, "1")) {
        bool weightsPreComputed = true; 
        bool exitIfTestFailed = settings->GetSettingAs<int>("BREAK_ON_DERIVATIVE_TESTS", ProjectSettings::optional, "1");
        tm["tests"]->start(); 
        // The test class only computes weights if they havent been done already
        DerivativeTests* der_test = new DerivativeTests(dim, der, subdomain, weightsPreComputed);
        if (use_gpu) {
            // Applies weights on both GPU and CPU and compares results for the first 10 stencils
            der_test->compareGPUandCPUDerivs(10);
        }
        // Test approximations to derivatives of functions f(x,y,z) = 0, x, y, xy, etc. etc.
        der_test->testAllFunctions(exitIfTestFailed);
        // For now we can only test eigenvalues on an MPI size of 1 (we could distribute with Par-Eiegen solver)
        if (comm_unit->getSize() == 1) {
            if (compute_eigenvalues) 
            {
                // FIXME: why does this happen? Perhaps because X Y and Z are unidirectional? 
                // Test X and 4 eigenvalues are > 0
                // Test Y and 30 are > 0
                // Test Z and 36 are > 0
                // NOTE: the 0 here implies we compute the eigenvalues but do not run the iterations of the random perturbation test
                der_test->testEigen(RBFFD::LAPL, 0);
            }
        }
        tm["tests"]->stop();
    }

    // SOLVE HEAT EQUATION

    ExactSolution* exact = getExactSolution(dim); 

    TimeDependentPDE* pde; 
    tm["heat_init"]->start(); 
    // We need to provide comm_unit to pass ghost node info
#if 0
    if (use_gpu) {
        pde = new HeatPDE(subdomain, der, comm_unit, true); 
    } else 
#endif
    { 
        // Implies initial conditions are generated
        // true here indicates the weights are computed. 
        pde = new HeatPDE(subdomain, der, comm_unit, uniformDiffusion, true);
    }
    pde->setStartEndTime(start_time, end_time);

    pde->fillInitialConditions(exact);

    // Broadcast updates for timestep, initial conditions for ghost nodes, etc. 
    tm["updates"]->start(); 
    comm_unit->broadcastObjectUpdates(pde);
    comm_unit->barrier();
    tm["updates"]->stop();

    tm["heat_init"]->stop(); 

    //TODO:    pde->setRelErrTol(max_global_rel_error); 

    // Setup a logging class that will monitor our iteration and dump intermediate files
#if USE_VTK
    // TODO: update VtuPDEWriter for the new PDE classes
    PDEWriter* writer = new VtuPDEWriter(subdomain, pde, comm_unit, local_sol_dump_frequency, global_sol_dump_frequency);
#else 
    PDEWriter* writer = new PDEWriter(subdomain, pde, comm_unit, local_sol_dump_frequency, global_sol_dump_frequency);
#endif 

    // Test DT: 
    // 1) get the minimum avg stencil radius (for stencil area--i.e., dx^2)
    double avgdx = 1000.;
    std::vector<StencilType>& sten = subdomain->getStencils();
    for (size_t i=0; i < sten.size(); i++) {
        double dx = subdomain->getStencilRadius(i);
        if (dx < avgdx) {
            avgdx = dx; 
        }
    }
    // Laplacian = d^2/dx^2
    double sten_area = avgdx*avgdx;
    // Not sure where Gordon came up with this parameter.
    // for second centered difference and euler time we have nu = 0.5
    double nu = 0.1;
    //          dt <= nu/dx^2 
    // is valid for stability in some FD schemes. 
    double max_dt = nu*(sten_area);
	printf("dt = %f (max_dt = %f; 0.5dx^2 = %f)\n", dt, max_dt, 0.5*sten_area);
    // This appears to be consistent with Chinchipatnam2006 (Thesis)
    // TODO: get more details on CFL for RBFFD
    // note: checking stability only works if we have all weights for all
    // nodes, so we dont do it in parallel
    if (compute_eigenvalues && (comm_unit->getSize() == 1)) {
        RBFFD::EigenvalueOutput eigs = der->getEigenvalues();
        max_dt = 2. / eigs.max_neg_eig;
        printf("Suggested max_dt based on eigenvalues (2/lambda_max)= %f\n", max_dt);
        
        // CFL condition:
        if (dt > max_dt) {
            std::cout << "WARNING! your choice of timestep (" << dt << ") is TOO LARGE for to maintain stability of system. According to eigenvalues, it must be less than " << max_dt << std::endl;
            if (use_eigen_dt) {
                dt = max_dt;
            } else {
                //exit(EXIT_FAILURE);
            }
        }
    }

    std::cout << "[MAIN] ********* USING TIMESTEP dt=" << dt << " ********** " << std::endl;

    //    subdomain->printCenterMemberships(subdomain->G, "G = " );
    //subdomain->printBoundaryIndices("INDICES OF GLOBAL BOUNDARY NODES: ");
    int iter;

    int num_iters = (int) ((end_time - start_time) / dt);
    std::cout << "NUM_ITERS = " << num_iters << std::endl;

    for (iter = 0; iter < num_iters && iter < max_num_iters; iter++) {
        writer->update(iter);


#if 0
        char label[256]; 
        sprintf(label, "LOCAL INPUT SOLUTION [local_indx (global_indx)] FOR ITERATION %d", iter); 
        pde->printSolution(label); 
#endif 

        tm["timestep"]->start(); 
        pde->advance((TimeDependentPDE::TimeScheme)timescheme, dt);
        tm["timestep"]->stop(); 

        // This just double checks that all procs have ghost node info.
        // pde->advance(..) should broadcast intermediate updates as needed,
        // but updated solution. 
        tm["updates"]->start(); 
        comm_unit->broadcastObjectUpdates(pde);
        comm_unit->barrier();
        tm["updates"]->stop();

        if (!(iter % local_sol_dump_frequency)) {

            std::cout << "\n*********** Rank " << comm_unit->getRank() << " Local Solution [ Iteration: " << iter << " (t = " << pde->getTime() << ") ] *************" << endl;
            pde->checkLocalError(exact, max_local_rel_error); 
        }
        if (!(iter % global_sol_dump_frequency)) {
            tm["consolidate"]->start(); 
            comm_unit->consolidateObjects(pde);
            comm_unit->barrier();
            tm["consolidate"]->stop(); 
            if (comm_unit->isMaster()) {
                std::cout << "\n*********** Global Solution [ Iteration: " << iter << " (t = " << pde->getTime() << ") ] *************" << endl;
                pde->checkGlobalError(exact, grid, max_global_rel_error); 
            }
        }
#if 0
        sprintf(label, "LOCAL UPDATED SOLUTION [local_indx (global_indx)] AFTER %d ITERATIONS", iter+1); 
        pde->printSolution(label); 
#endif 

        //        double nrm = pde->maxNorm();
        if (prompt_to_continue && comm_unit->isMaster()) {
            std::string buf; 
            cout << "Press [Enter] to continue" << std::endl;
            cin.get(); 
        }
    }
#if 1
    printf("after heat\n");

    // NOTE: all local subdomains have a U_G solution which is consolidated
    // into the MASTER process "global_U_G" solution. 
    tm["consolidate"]->start(); 
    comm_unit->consolidateObjects(pde);
    comm_unit->barrier();
    tm["consolidate"]->stop(); 
    //    subdomain->writeGlobalSolutionToFile(-1); 
    std::cout << "Checking Solution on Master\n";
    if (comm_unit->getRank() == 0) {
        pde->writeGlobalGridAndSolutionToFile(grid->getNodeList(), (char*) "FINAL_SOLUTION.txt");
#if 0
        // NOTE: the final solution is assembled, but we have to use the 
        // GLOBAL node list instead of a local subdomain node list
        cout << "FINAL ITER: " << iter << endl;
        std::vector<double> final_sol(grid->getNodeListSize()); 
        ifstream fin; 
        fin.open("FINAL_SOLUTION.txt"); 

        int count = 0; 
        for (int count = 0; count < final_sol.size(); count++) {
            Vec3 node; 
            double val;
            fin >> node[0] >> node[1] >> node[2] >> val;
            if (fin.good()) {
                final_sol[count] = val;
                // std::cout << "Read: " << node << ", " << final_sol[count] << std::endl; 
            }
        }
        fin.close();
#endif 
        std::cout << "============== Verifying Accuracy of Final Solution =============\n"; 
        pde->checkGlobalError(exact, grid, max_global_rel_error); 
        std::cout << "============== Solution Valid =============\n"; 

        delete(grid);
    }