int main (int argc, char *argv[]) {
    /*
     * Initialize TAU and start a timer for the main function.
     */
    TAU_INIT(&argc, &argv);
    TAU_PROFILE_SET_NODE(0);
    TAU_PROFILE_TIMER(tautimer, __func__, __FILE__, TAU_USER);
    TAU_PROFILE_START(tautimer);

    /*
     * Initialize MPI. We don't require threaded support, but with threads
     * we can send the TAU data over SOS asynchronously.
     */
    int rc = MPI_SUCCESS;
    int provided = 0;
    rc = MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
    if (rc != MPI_SUCCESS) {
        char errorstring[MPI_MAX_ERROR_STRING];
        int length = 0;
        MPI_Error_string(rc, errorstring, &length);
        printf("Error: MPI_Init failed, rc = %d\n%s\n", rc, errorstring);
        exit(1);
    }
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    MPI_Comm_size(MPI_COMM_WORLD, &commsize);
    //my_printf("MPI_Init_thread: provided = %d, MPI_THREAD_MULTIPLE=%d\n", provided, MPI_THREAD_MULTIPLE);
    my_printf("%s Running with commsize %d\n", argv[0], commsize);

    /*
     * Initialize SOS. This will have been done in TAU, but in case we don't
     * use TAU, we still want SOS active.
     */
    SOS_init_wrapper(&argc, &argv);

    /*
     * Run the worker code.
     */
    worker(argc, argv);

    /*
     * Finalize SOS and MPI.
     */
    SOS_FINALIZE();
    MPI_Finalize();
    my_printf("%s Done.\n", argv[0]);

    /*
     * Stop our main TAU timer. It probably will have been stopped
     * already, inside the MPI_Finalize call.
     */
    TAU_PROFILE_STOP(tautimer);
    return 0;
}
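/*
 * A minimal sketch (not part of the original program) of how the
 * MPI_Error_string pattern above can be factored into a reusable check
 * macro. MPI_CHECK is a hypothetical helper name; MPI_Error_string,
 * MPI_MAX_ERROR_STRING, and MPI_Abort are standard MPI.
 */
#define MPI_CHECK(call)                                               \
    do {                                                              \
        int _rc = (call);                                             \
        if (_rc != MPI_SUCCESS) {                                     \
            char _msg[MPI_MAX_ERROR_STRING];                          \
            int _len = 0;                                             \
            MPI_Error_string(_rc, _msg, &_len);                       \
            fprintf(stderr, "MPI error %d: %s\n", _rc, _msg);         \
            MPI_Abort(MPI_COMM_WORLD, _rc);                           \
        }                                                             \
    } while (0)

/* Usage (hypothetical):
 *   MPI_CHECK(MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided));
 */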
int worker(int argc, char* argv[]) {
    TAU_PROFILE_TIMER(timer, __func__, __FILE__, TAU_USER);
    TAU_PROFILE_START(timer);
    static bool announced = false;
    my_printf("%d of %d In worker A\n", myrank, commsize);

    /* validate input */
    validate_input(argc, argv);
    my_printf("Worker A will execute %d iterations.\n", iterations);

    /* ADIOS: These declarations are required to match the generated
     * gread_/gwrite_ functions. (Those functions are generated by
     * calling 'gpp.py adios_config.xml'.)
     */
    uint64_t adios_groupsize;
    uint64_t adios_totalsize;
    uint64_t adios_handle;
    char adios_filename[256];
    MPI_Comm adios_comm;

    /* ADIOS: Can duplicate, split the world, whatever.
     * This allows you to have P writers to N files.
     * With no splits, everyone shares 1 file, but
     * can write lock-free by using different areas.
     */
    //MPI_Comm_dup(MPI_COMM_WORLD, &adios_comm);
    adios_comm = MPI_COMM_WORLD;

    int NX = 10;
    int NY = 1;
    double t[NX];
    double p[NX];

    /* ADIOS: Set up the adios communications and buffers, open the file. */
    if (send_to_b) {
        sprintf(adios_filename, "adios_a_to_b.bp");
        adios_init("adios_config.xml", adios_comm);
    }

    int index, i;
    for (index = 0 ; index < iterations ; index++) {
        /* Do some exchanges with neighbors */
        do_neighbor_exchange();
        /* "Compute" */
        compute(index);
        /* Write output */
        //my_printf("a");
        for (i = 0; i < NX; i++) {
            t[i] = index*100.0 + myrank*NX + i;
        }
        for (i = 0; i < NY; i++) {
            p[i] = index*1000.0 + myrank*NY + i;
        }
        if (send_to_b) {
            TAU_PROFILE_TIMER(adiostimer, "ADIOS send", __FILE__, TAU_USER);
            TAU_PROFILE_START(adiostimer);
            /* Create the file on the first iteration, append afterwards. */
            if (index == 0) {
                adios_open(&adios_handle, "a_to_b", adios_filename, "w", adios_comm);
            } else {
                adios_open(&adios_handle, "a_to_b", adios_filename, "a", adios_comm);
            }
            /* ADIOS: Actually write the data out.
             * Yes, this is the recommended method, and this way, changes in
             * configuration with the .XML file will, even in the worst-case
             * scenario, merely require running 'gpp.py adios_config.xml'
             * and typing 'make'.
             */
            #include "gwrite_a_to_b.ch"
            /* ADIOS: Close out the file completely and finalize.
             * If MPI is being used, this must happen before MPI_Finalize().
             */
            adios_close(adios_handle);
            TAU_PROFILE_STOP(adiostimer);
#if 1
            /* Announce and publish the SOS metadata once. */
            if (!announced) {
                SOS_val foo;
                foo.i_val = NX;
                SOS_pack(example_pub, "NX", SOS_VAL_TYPE_INT, foo);
                SOS_announce(example_pub);
                SOS_publish(example_pub);
                announced = true;
            }
#endif
        }
        MPI_Barrier(MPI_COMM_WORLD);
    }
    MPI_Barrier(MPI_COMM_WORLD);
    if (send_to_b) {
        adios_finalize(myrank);
    }
    my_printf("Worker A exiting.\n");
    //MPI_Comm_free(&adios_comm);
    TAU_PROFILE_STOP(timer);
    /* exit */
    return 0;
}
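/*
 * For illustration only: a sketch of what the generated gwrite_a_to_b.ch
 * is assumed to expand to, given the declarations above (adios_groupsize,
 * adios_totalsize, adios_handle, t, p, NX, NY). The variable name
 * "temperature" matches what the reader side inquires; "pressure" and the
 * wrapper function name are assumptions. The real contents are produced
 * by 'gpp.py adios_config.xml' and depend on the XML group definition.
 */
static void write_a_to_b_sketch(uint64_t adios_handle, int NX, int NY,
                                double *t, double *p)
{
    uint64_t adios_groupsize, adios_totalsize;
    adios_groupsize = sizeof(int) * 2        /* NX, NY            */
                    + sizeof(double) * NX    /* temperature array */
                    + sizeof(double) * NY;   /* pressure array    */
    adios_group_size(adios_handle, adios_groupsize, &adios_totalsize);
    adios_write(adios_handle, "NX", &NX);
    adios_write(adios_handle, "NY", &NY);
    adios_write(adios_handle, "temperature", t);
    adios_write(adios_handle, "pressure", p);
}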
int main (int argc, char *argv[]) {
    validate_input(argc, argv);

    /*
     * Initialize TAU and start a timer for the main function.
     */
    TAU_INIT(&argc, &argv);
    TAU_PROFILE_SET_NODE(0);
    TAU_PROFILE_TIMER(tautimer, __func__, my_name, TAU_USER);
    TAU_PROFILE_START(tautimer);

    /*
     * Initialize MPI. We don't require threaded support, but with threads
     * we can send the TAU data over SOS asynchronously.
     */
    int rc = MPI_SUCCESS;
    int provided = 0;
    rc = MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
    if (rc != MPI_SUCCESS) {
        char errorstring[MPI_MAX_ERROR_STRING];
        int length = 0;
        MPI_Error_string(rc, errorstring, &length);
        fprintf(stderr, "Error: MPI_Init failed, rc = %d\n%s\n", rc, errorstring);
        fflush(stderr);
        exit(99);
    }
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
    my_printf("%s %s %d Running with comm_size %d\n", argv[0], my_name, getpid(), comm_size);

    MPI_Comm adios_comm;
    MPI_Comm_dup(MPI_COMM_WORLD, &adios_comm);
    adios_init("arrays.xml", adios_comm);

    /*
     * Loop and do the work.
     */
    int iter = 0;
    char tmpstr[256] = {0};
    int * return_codes = (int *)(calloc(num_sources, sizeof(int)));
    while (iter < iterations) {
        int index;
        /*
         * Read upstream input.
         */
        for (index = 0 ; index < num_sources ; index++) {
            if (return_codes[index] > 0) {
                my_printf("%s source is gone\n", sources[index]);
                continue; // this input is gone
            }
            my_printf("%s reading from %s.\n", my_name, sources[index]);
            sprintf(tmpstr, "%s READING FROM %s", my_name, sources[index]);
            TAU_START(tmpstr);
            //mpi_reader(adios_comm, sources[index]);
            return_codes[index] = flexpath_reader(adios_comm, index);
            TAU_STOP(tmpstr);
        }
        /*
         * "Compute".
         */
        my_printf("%s computing.\n", my_name);
        compute(iter);
        /* With no sources, quit after the last iteration; otherwise quit
         * only once every source stream has terminated. */
        bool time_to_go = (num_sources == 0) ? (iter == (iterations-1)) : true;
        for (index = 0 ; index < num_sources ; index++) {
            if (return_codes[index] == 0) {
                time_to_go = false;
                break; // out of this for loop
            }
        }
        /*
         * Send output downstream.
         */
        for (index = 0 ; index < num_sinks ; index++) {
            my_printf("%s writing to %s.\n", my_name, sinks[index]);
            sprintf(tmpstr, "%s WRITING TO %s", my_name, sinks[index]);
            TAU_START(tmpstr);
            //mpi_writer(adios_comm, sinks[index]);
            flexpath_writer(adios_comm, index, (iter > 0), time_to_go);
            TAU_STOP(tmpstr);
        }
        if (time_to_go) {
            break; // out of the while loop
        }
        my_printf("%s not time to go...\n", my_name);
        iter++;
    }

    /*
     * Finalize ADIOS.
     */
    const char * const dot_filename = ".finished";
    if (num_sources > 0) {
        adios_read_finalize_method(ADIOS_READ_METHOD_FLEXPATH);
#if 0
    } else {
        while (true) {
            // Assume this is the main process. It can't exit until
            // the last process is done.
            if (access(dot_filename, F_OK) != -1) {
                // file exists
                unlink(dot_filename);
                break;
            } else {
                // file doesn't exist
                sleep(1);
            }
        }
#endif
    }
    if (num_sinks > 0) {
        adios_finalize(my_rank);
#if 0
    } else {
        // Assume this is the last process.
        // Tell the main process we are done.
        FILE *file;
        if ((file = fopen(dot_filename, "w"))) {
            fprintf(file, "done.\n");
            fclose(file);
        }
#endif
    }

    /*
     * Finalize MPI.
     */
    free(return_codes);
    MPI_Comm_free(&adios_comm);
    MPI_Finalize();
    my_printf("%s Done.\n", my_name);
    TAU_PROFILE_STOP(tautimer);
    return 0;
}
int worker(int argc, char* argv[]) {
    TAU_PROFILE_TIMER(timer, __func__, __FILE__, TAU_USER);
    TAU_PROFILE_START(timer);
    my_printf("%d of %d In worker B\n", myrank, commsize);
    static bool announced = false;

    /* validate input */
    validate_input(argc, argv);
    my_printf("Worker B will execute until it sees %d iterations.\n", iterations);

    /* ADIOS: These declarations are required to match the generated
     * gread_/gwrite_ functions. (Those functions are generated by
     * calling 'gpp.py adios_config.xml'.)
     * EXCEPT THAT the generation of reader code is broken,
     * so we write the reader code manually.
     */
    uint64_t adios_groupsize;
    uint64_t adios_totalsize;
    uint64_t adios_handle;
    void * data = NULL;
    uint64_t start[2], count[2];
    int i, j, steps = 0;
    int NX = 10;
    int NY = 1;
    double t[NX];
    double p[NX];

    /* ADIOS: Can duplicate, split the world, whatever.
     * This allows you to have P writers to N files.
     * With no splits, everyone shares 1 file, but
     * can write lock-free by using different areas.
     */
    MPI_Comm adios_comm, adios_comm_b_to_c;
    adios_comm = MPI_COMM_WORLD;
    //MPI_Comm_dup(MPI_COMM_WORLD, &adios_comm);
    adios_comm_b_to_c = MPI_COMM_WORLD;
    //MPI_Comm_dup(MPI_COMM_WORLD, &adios_comm_b_to_c);

    enum ADIOS_READ_METHOD method = ADIOS_READ_METHOD_FLEXPATH;
    adios_read_init_method(method, adios_comm, "verbose=3");
    if (adios_errno != err_no_error) {
        fprintf(stderr, "rank %d: Error %d at init: %s\n",
                myrank, adios_errno, adios_errmsg());
        exit(4);
    }
    if (send_to_c) {
        adios_init("adios_config.xml", adios_comm);
    }

    /* ADIOS: Set up the adios communications and buffers, open the file. */
    ADIOS_FILE *fp;       // file handle
    ADIOS_VARINFO *vi;    // information about one variable
    ADIOS_SELECTION *sel;
    char adios_filename_a_to_b[256];
    char adios_filename_b_to_c[256];
    enum ADIOS_LOCKMODE lock_mode = ADIOS_LOCKMODE_NONE;
    double timeout_sec = 1.0;
    sprintf(adios_filename_a_to_b, "adios_a_to_b.bp");
    sprintf(adios_filename_b_to_c, "adios_b_to_c.bp");
    my_printf("rank %d: Worker B opening file: %s\n", myrank, adios_filename_a_to_b);

    fp = adios_read_open(adios_filename_a_to_b, method, adios_comm,
                         lock_mode, timeout_sec);
    if (adios_errno == err_file_not_found) {
        fprintf(stderr, "rank %d: Stream not found after waiting %.1f seconds: %s\n",
                myrank, timeout_sec, adios_errmsg());
        exit(1);
    } else if (adios_errno == err_end_of_stream) {
        // stream was gone before we tried to open it
        fprintf(stderr, "rank %d: Stream terminated before open. %s\n",
                myrank, adios_errmsg());
        exit(2);
%s\n", myrank, adios_errmsg()); exit(2); } else if (fp == NULL) { // some other error happened fprintf (stderr, "rank %d: Error %d at opening: %s\n", myrank, adios_errno, adios_errmsg()); exit(3); } else { my_printf("Found file %s\n", adios_filename_a_to_b); my_printf ("File info:\n"); my_printf (" current step: %d\n", fp->current_step); my_printf (" last step: %d\n", fp->last_step); my_printf (" # of variables: %d:\n", fp->nvars); vi = adios_inq_var(fp, "temperature"); adios_inq_var_blockinfo(fp, vi); printf ("ndim = %d\n", vi->ndim); printf ("nsteps = %d\n", vi->nsteps); printf ("dims[%llu][%llu]\n", vi->dims[0], vi->dims[1]); uint64_t slice_size = vi->dims[0]/commsize; if (myrank == commsize-1) { slice_size = slice_size + vi->dims[0]%commsize; } start[0] = myrank * slice_size; count[0] = slice_size; start[1] = 0; count[1] = vi->dims[1]; data = malloc (slice_size * vi->dims[1] * 8); /* Processing loop over the steps (we are already in the first one) */ while (adios_errno != err_end_of_stream && steps < iterations) { steps++; // steps start counting from 1 TAU_PROFILE_TIMER(adios_recv_timer, "ADIOS recv", __FILE__, TAU_USER); TAU_PROFILE_START(adios_recv_timer); sel = adios_selection_boundingbox (vi->ndim, start, count); adios_schedule_read (fp, sel, "temperature", 0, 1, data); adios_perform_reads (fp, 1); if (myrank == 0) printf ("--------- B Step: %d --------------------------------\n", fp->current_step); #if 0 printf("B rank=%d: [0:%lld,0:%lld] = [", myrank, vi->dims[0], vi->dims[1]); for (i = 0; i < slice_size; i++) { printf (" ["); for (j = 0; j < vi->dims[1]; j++) { printf ("%g ", *((double *)data + i * vi->dims[1] + j)); } printf ("]"); } printf (" ]\n\n"); #endif // advance to 1) next available step with 2) blocking wait adios_advance_step (fp, 0, timeout_sec); if (adios_errno == err_step_notready) { printf ("B rank %d: No new step arrived within the timeout. Quit. %s\n", myrank, adios_errmsg()); break; // quit while loop } TAU_PROFILE_STOP(adios_recv_timer); /* Do some exchanges with neighbors */ //do_neighbor_exchange(); /* "Compute" */ compute(steps); for (i = 0; i < NX; i++) { t[i] = steps*100.0 + myrank*NX + i; } for (i = 0; i < NY; i++) { p[i] = steps*1000.0 + myrank*NY + i; } if (send_to_c) { TAU_PROFILE_TIMER(adios_send_timer, "ADIOS send", __FILE__, TAU_USER); TAU_PROFILE_START(adios_send_timer); /* ADIOS: write to the next application in the workflow */ if (steps == 0) { adios_open(&adios_handle, "b_to_c", adios_filename_b_to_c, "w", adios_comm_b_to_c); } else { adios_open(&adios_handle, "b_to_c", adios_filename_b_to_c, "a", adios_comm_b_to_c); } /* ADIOS: Actually write the data out. * Yes, this is the recommended method, and this way, changes in * configuration with the .XML file will, even in the worst-case * scenario, merely require running 'gpp.py adios_config.xml' * and typing 'make'. */ #include "gwrite_b_to_c.ch" /* ADIOS: Close out the file completely and finalize. * If MPI is being used, this must happen before MPI_Finalize(). */ adios_close(adios_handle); TAU_PROFILE_STOP(adios_send_timer); #if 1 if (!announced) { SOS_val foo; foo.i_val = NX; SOS_pack(example_pub, "NX", SOS_VAL_TYPE_INT, foo); SOS_announce(example_pub); SOS_publish(example_pub); announced = true; } #endif } MPI_Barrier(adios_comm_b_to_c); } MPI_Barrier(MPI_COMM_WORLD); adios_read_close(fp); /* ADIOS: Close out the file completely and finalize. * If MPI is being used, this must happen before MPI_Finalize(). 
        adios_read_finalize_method(method);
    }
    if (send_to_c) {
        adios_finalize(myrank);
    }
    free(data);
    //MPI_Comm_free(&adios_comm);
    //MPI_Comm_free(&adios_comm_b_to_c);
    TAU_PROFILE_STOP(timer);
    /* exit */
    return 0;
}
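/*
 * The SOS announce/publish pattern used by both workers, isolated for
 * clarity as a helper. Only SOS calls that already appear above are used;
 * the wrapper name and the SOS_pub* type of the publication handle are
 * assumptions. The schema must be announced once before the first publish;
 * after that, packed values can be published repeatedly.
 */
static void publish_nx_sketch(SOS_pub *pub, int NX, bool *announced)
{
    SOS_val val;
    val.i_val = NX;                              /* integer payload   */
    SOS_pack(pub, "NX", SOS_VAL_TYPE_INT, val);  /* stage name/value  */
    if (!*announced) {
        SOS_announce(pub);                       /* send schema once  */
        *announced = true;
    }
    SOS_publish(pub);                            /* push staged data  */
}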
int main(int argc, char** argv) {
    ios::sync_with_stdio();
#ifdef USE_TAU
    TAU_PROFILE_INIT(argc, argv);
    TAU_PROFILE("main", "int (int argc, char** argv)", TAU_DEFAULT);
    TAU_PROFILE_TIMER(loop_timer, "Computation Loop", "", TAU_USER);
    TAU_PROFILE_TIMER(bc_timer, "Boundary Condition", "", TAU_USER);
    TAU_PROFILE_TIMER(update_timer, "Ghost Boundary Update", "", TAU_USER);
    TAU_PROFILE_TIMER(copy_timer, "Array Copy", "", TAU_USER);
#endif

    int Number_of_Processors = 0;
    Optimization_Manager::setForceVSG_Update(Off);
    Optimization_Manager::Initialize_Virtual_Machine("", Number_of_Processors, argc, argv);
    Partitioning_Type::SpecifyDefaultInternalGhostBoundaryWidths(1);
    Optimization_Manager::setForceVSG_Update(Off);
    Index::setBoundsCheck(off);

    int myid = Communication_Manager::My_Process_Number;
    int numProcs = Communication_Manager::Number_Of_Processors;

    const int Xsize = 1003*numProcs, Ysize = 1003;
    const Range ix(-1, Xsize-2), iy(-1, Ysize-2), all;
    const Range ix1(0, Xsize-3), iy1(0, Ysize-3);

    Partitioning_Type thePartition;
    thePartition.SpecifyDecompositionAxes(1);
    doubleArray A(ix,iy), old_A(ix,iy), x(ix,iy), y(ix,iy), temp(ix,iy);
    const double dx = 1./(Xsize-3), dy = 1./(Ysize-3), dt = 0.1/(Xsize+Ysize);
    double theTime = 0.0, maxError;
    A.partition(thePartition);
    old_A.partition(thePartition);
    temp.partition(thePartition);
    x.partition(thePartition);
    y.partition(thePartition);
    intSerialArray theProcessorSet = (A.getPartition()).getProcessorSet();
    doubleSerialArray xlocal = x.getLocalArray();
    doubleSerialArray ylocal = y.getLocalArray();
    cout << "size of xlocal: " << xlocal.getSize() << endl;

    Optimization_Manager::setOptimizedScalarIndexing(On);
    for (int i = xlocal.getBase(0); i <= xlocal.getBound(0); i++)
        xlocal(i,all) = dx*i;
    for (int j = ylocal.getBase(1); j <= ylocal.getBound(1); j++)
        ylocal(all,j) = dy*j;
    Optimization_Manager::setOptimizedScalarIndexing(Off);
    x.updateGhostBoundaries();
    y.updateGhostBoundaries();

    // Initialize with the exact solution A(x,y,t) = (1+t)(2+x+y).
    A = (1.0 + theTime)*(2.0 + x + y);

    doubleSerialArray oldALocal = old_A.getLocalArray();
    doubleSerialArray ALocal = A.getLocalArray();
    doubleSerialArray tempLocal = temp.getLocalArray();

    // Interior index ranges: exclude the boundary row/column on every rank.
    int iLower = ALocal.getBase(0)+1;
    int iUpper = ALocal.getBound(0)-1;
    int jLower = ALocal.getBase(1)+1;
    int jUpper = ALocal.getBound(1)-1;
    Range ILocInterior(iLower, iUpper);
    Range JLocInterior(jLower, jUpper);

    // Copy ranges: include the physical boundary on the first and last
    // rank, but exclude ghost cells on interior partition edges.
    iLower = (myid > 0)          ? ALocal.getBase(0)+1  : ALocal.getBase(0);
    iUpper = (myid < numProcs-1) ? ALocal.getBound(0)-1 : ALocal.getBound(0);
    jLower = ALocal.getBase(1);
    jUpper = ALocal.getBound(1);
    Range ILoc(iLower, iUpper);
    Range JLoc(jLower, jUpper);

    if (myid == 0) cout << "-----Starting computation" << endl;
    double t = MPI_Wtime();

#ifdef USE_TAU
    TAU_PROFILE_START(copy_timer);
#endif
    oldALocal = ALocal;
#ifdef USE_TAU
    TAU_PROFILE_STOP(copy_timer);
    TAU_PROFILE_START(update_timer);
#endif
    old_A.updateGhostBoundaries();
#ifdef USE_TAU
    TAU_PROFILE_STOP(update_timer);
    TAU_PROFILE_START(loop_timer);
#endif
    // First step: forward Euler to prime the leapfrog scheme.
    tempLocal(ILocInterior,JLocInterior) =
        ALocal(ILocInterior,JLocInterior) -
        dt*( ( ALocal(ILocInterior+1,JLocInterior) -
               ALocal(ILocInterior-1,JLocInterior) ) / (2.0*dx) +
             ( ALocal(ILocInterior,JLocInterior+1) -
               ALocal(ILocInterior,JLocInterior-1) ) / (2.0*dy) -
             (4.0 + 2.0*theTime +
              xlocal(ILocInterior,JLocInterior) +
              ylocal(ILocInterior,JLocInterior)) );
#ifdef USE_TAU
    TAU_PROFILE_STOP(loop_timer);
    TAU_PROFILE_START(copy_timer);
#endif
    //
    // copy temp into A
    //
    ALocal(ILocInterior,JLocInterior) = tempLocal(ILocInterior,JLocInterior);
#ifdef USE_TAU
    TAU_PROFILE_STOP(copy_timer);
    TAU_PROFILE_START(bc_timer);
#endif
    // Re-impose the exact solution on all four boundaries at t+dt.
    A(all,Ysize-2) = (1.0+(theTime+dt))*(2.0+x(all,Ysize-2)+y(all,Ysize-2));
    A(all,-1)      = (1.0+(theTime+dt))*(2.0+x(all,-1)+y(all,-1));
    A(-1,iy1)      = (1.0+(theTime+dt))*(2.0+x(-1,iy1)+y(-1,iy1));
    A(Xsize-2,iy1) = (1.0+(theTime+dt))*(2.0+x(Xsize-2,iy1)+y(Xsize-2,iy1));
#ifdef USE_TAU
    TAU_PROFILE_STOP(bc_timer);
    TAU_PROFILE_START(update_timer);
#endif
    A.updateGhostBoundaries();
#ifdef USE_TAU
    TAU_PROFILE_STOP(update_timer);
#endif
    theTime += dt;

    // Main time-stepping loop: leapfrog (second-order in time).
    for (int k = 0; k < 100; k++) {
#ifdef USE_TAU
        TAU_PROFILE_START(loop_timer);
#endif
        tempLocal(ILocInterior,JLocInterior) =
            oldALocal(ILocInterior,JLocInterior) -
            2.*dt*( ( ALocal(ILocInterior+1,JLocInterior) -
                      ALocal(ILocInterior-1,JLocInterior) ) / (2.0*dx) +
                    ( ALocal(ILocInterior,JLocInterior+1) -
                      ALocal(ILocInterior,JLocInterior-1) ) / (2.0*dy) -
                    (4.0 + 2.0*theTime +
                     xlocal(ILocInterior,JLocInterior) +
                     ylocal(ILocInterior,JLocInterior)) );
#ifdef USE_TAU
        TAU_PROFILE_STOP(loop_timer);
        TAU_PROFILE_START(copy_timer);
#endif
        oldALocal(ILoc,JLoc) = ALocal(ILoc,JLoc);
        ALocal(ILoc,JLoc) = tempLocal(ILoc,JLoc);
#ifdef USE_TAU
        TAU_PROFILE_STOP(copy_timer);
        TAU_PROFILE_START(bc_timer);
#endif
        A(all,Ysize-2) = (1.0+(theTime+dt))*(2.0+x(all,Ysize-2)+y(all,Ysize-2));
        A(all,-1)      = (1.0+(theTime+dt))*(2.0+x(all,-1)+y(all,-1));
        A(-1,iy1)      = (1.0+(theTime+dt))*(2.0+x(-1,iy1)+y(-1,iy1));
        A(Xsize-2,iy1) = (1.0+(theTime+dt))*(2.0+x(Xsize-2,iy1)+y(Xsize-2,iy1));
#ifdef USE_TAU
        TAU_PROFILE_STOP(bc_timer);
        TAU_PROFILE_START(update_timer);
#endif
        A.updateGhostBoundaries();
#ifdef USE_TAU
        TAU_PROFILE_STOP(update_timer);
#endif
        theTime += dt;
    }
    t = MPI_Wtime() - t;
    double maxTime;
    MPI_Reduce(&t, &maxTime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    if (myid == 0) {
        cout << " Total Time= " << maxTime << " s" << endl;
    }

    // Compare against the exact solution to get the global max error.
    maxError = max(fabs(A(ix,iy) - (1.0+theTime)*(2.0+x(ix,iy)+y(ix,iy))));
    if (myid == 0) {
        cout << "max error at t=" << theTime << " is: " << maxError << endl;
        cout << "number of messages sent= "
             << Diagnostic_Manager::getNumberOfMessagesSent() << endl;
        cout << "number of messages received= "
             << Diagnostic_Manager::getNumberOfMessagesReceived() << endl;
        ofstream OutFile;
        OutFile.open("/p/gb1/bmiller/ConvectPpp/ConvectPppNew.ConstSizePerProc.out", ios::app);
        OutFile << numProcs << " " << maxTime << " " << maxError << endl;
        OutFile.close();
    }
#ifdef USE_TAU
    TAU_PROFILE_EXIT("Finishing Profiling.");
#endif
    Optimization_Manager::Exit_Virtual_Machine();
    return 0;
}
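/*
 * A minimal sketch (plain C arrays, single process, hypothetical function
 * name) of the leapfrog update the P++ loop above expresses in array
 * syntax. The PDE is dA/dt + dA/dx + dA/dy = f with forcing
 * f = 4 + 2t + x + y, chosen so the exact solution is
 * A(x,y,t) = (1+t)(2+x+y): then A_t = 2+x+y and A_x = A_y = 1+t,
 * so A_t + A_x + A_y = 4 + 2t + x + y = f, which is why the boundary
 * conditions and the max-error check both use (1+t)(2+x+y).
 */
void leapfrog_step(int nx, int ny, double t, double dt, double dx, double dy,
                   const double *x, const double *y,
                   const double *Aold, const double *A, double *Anew)
{
    for (int i = 1; i < nx - 1; i++) {
        for (int j = 1; j < ny - 1; j++) {
            double dAdx = (A[(i+1)*ny + j] - A[(i-1)*ny + j]) / (2.0 * dx);
            double dAdy = (A[i*ny + j + 1] - A[i*ny + j - 1]) / (2.0 * dy);
            double f    = 4.0 + 2.0*t + x[i] + y[j];  /* manufactured forcing */
            /* second-order leapfrog: step from t-dt over an interval of 2*dt */
            Anew[i*ny + j] = Aold[i*ny + j] - 2.0*dt*(dAdx + dAdy - f);
        }
    }
}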