Example #1
int
init_accel (void)
{
#if defined(_ENABLE_OPENACC_) || defined(_ENABLE_CUDA_)
    char *str;
    int local_rank, dev_count;
    int dev_id = 0;
#endif
#ifdef _ENABLE_CUDA_
    CUresult curesult = CUDA_SUCCESS;
    CUdevice cuDevice;
#endif

    switch (options.accel) {
#ifdef _ENABLE_CUDA_
        case managed:
        case cuda:
            if ((str = getenv("LOCAL_RANK")) != NULL) {
                cudaGetDeviceCount(&dev_count);
                local_rank = atoi(str);
                dev_id = local_rank % dev_count;
            }
        
            curesult = cuInit(0);
            if (curesult != CUDA_SUCCESS) {
                return 1;
            }
        
            curesult = cuDeviceGet(&cuDevice, dev_id);
            if (curesult != CUDA_SUCCESS) {
                return 1;
            }
        
            curesult = cuCtxCreate(&cuContext, 0, cuDevice);
            if (curesult != CUDA_SUCCESS) {
                return 1;
            }
            break;
#endif   
#ifdef _ENABLE_OPENACC_
        case openacc:
            if ((str = getenv("LOCAL_RANK")) != NULL) {
                dev_count = acc_get_num_devices(acc_device_not_host);
                local_rank = atoi(str);
                dev_id = local_rank % dev_count;
            }
        
            acc_set_device_num (dev_id, acc_device_not_host);
            break;
#endif   
        default:
            fprintf(stderr, "Invalid device type, should be cuda or openacc\n");
            return 1;
    }

    return 0;
}
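Example #1 reads the local rank from a LOCAL_RANK environment variable, which not every launcher exports under that name. As a hedged sketch (the helper name is hypothetical; the variable names are the ones set by Open MPI, MVAPICH2, and Slurm), a fallback probe could look like:

#include <stdlib.h>

static int
get_local_rank (void)
{
    /* Probe common launcher-specific variables, most generic first. */
    const char *vars[] = { "LOCAL_RANK", "OMPI_COMM_WORLD_LOCAL_RANK",
                           "MV2_COMM_WORLD_LOCAL_RANK", "SLURM_LOCALID" };
    size_t i;

    for (i = 0; i < sizeof vars / sizeof vars[0]; i++) {
        const char *str = getenv(vars[i]);
        if (str != NULL)
            return atoi(str);
    }
    return 0; /* nothing set: fall back to device 0 */
}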
Example #2
static void initGPU(int argc, char** argv) {
        // gets the device id (if specified) to run on
        int devId = -1;
        if (argc > 1) {
                devId = atoi(argv[1]);
                int devCount = acc_get_num_devices(acc_device_nvidia);
                if (devId < 0 || devId >= devCount) {
                        printf("The specified device ID is not supported.\n");
                        exit(1);
                }
        }
        if (devId != -1) {
                acc_set_device_num(devId, acc_device_nvidia);
        }
        // creates a context on the GPU just to
        // exclude initialization time from computations
        acc_init(acc_device_nvidia);

        // print device id
        devId = acc_get_device_num(acc_device_nvidia);
        printf("Running on GPU with ID %d.\n\n", devId);

}
Example #3
int main(int argc, char* argv[])
{
    acc_set_device_num(0, acc_device_nvidia);

    // read command line arguments
    readcmdline(&options, argc, argv);
    int nx = options.nx;
    int ny = options.ny;
    int N  = options.N;
    int nt = options.nt;

    printf("========================================================================\n");
    printf("                      Welcome to mini-stencil!\n");
    printf("mesh :: %d * %d, dx = %f\n", nx, ny, options.dx);
    printf("time :: %d, time steps from 0 .. %f\n", nt, options.nt * options.dt);
    printf("========================================================================\n");

    // allocate global fields
    x_new = (double*) malloc(sizeof(double)*nx*ny);
    x_old = (double*) malloc(sizeof(double)*nx*ny); 
    bndN  = (double*) malloc(sizeof(double)*nx);
    bndS  = (double*) malloc(sizeof(double)*nx); 
    bndE  = (double*) malloc(sizeof(double)*ny); 
    bndW  = (double*) malloc(sizeof(double)*ny); 

    double* b      = (double*) malloc(N*sizeof(double));
    double* deltax = (double*) malloc(N*sizeof(double));

    // set dirichlet boundary conditions to 0 all around
    memset(x_old, 0, sizeof(double) * nx * ny);
    memset(bndN, 0, sizeof(double) * nx);
    memset(bndS, 0, sizeof(double) * nx);
    memset(bndE, 0, sizeof(double) * ny);
    memset(bndW, 0, sizeof(double) * ny);
    memset(deltax, 0, sizeof(double) * N);

    // set the initial condition
    // a circle of concentration 0.1 centred at (xdim/4, ydim/4) with radius
    // no larger than 1/8 of both xdim and ydim
    memset(x_new, 0, sizeof(double) * nx * ny);
    double xc = 1.0 / 4.0;
    double yc = (ny - 1) * options.dx / 4;
    double radius = fmin(xc, yc) / 2.0;
    int i,j;
    //
    for (j = 0; j < ny; j++)
    {
        double y = (j - 1) * options.dx;
        for (i = 0; i < nx; i++)
        {
            double x = (i - 1) * options.dx;
            if ((x - xc) * (x - xc) + (y - yc) * (y - yc) < radius * radius)
                //((double(*)[nx])x_new)[j][i] = 0.1;
                x_new[i+j*nx] = 0.1;
        }
    }

    flops_bc = 0;
    flops_diff = 0;
    flops_blas1 = 0;
    verbose_output = 0;
    iters_cg = 0;
    iters_newton = 0;

    // initialize temporary storage fields used by the cg solver
    // I do this here so that the fields are persistent between calls
    // to the CG solver. This is useful if we want to avoid malloc/free calls
    // on the device for the OpenACC implementation (feel free to suggest a better
    // method for doing this)
    printf("INITIALIZING CG STATE\n");
    Ap    = (double*) malloc(N*sizeof(double));
    r     = (double*) malloc(N*sizeof(double)); 
    p     = (double*) malloc(N*sizeof(double));
    Fx    = (double*) malloc(N*sizeof(double));
    Fxold = (double*) malloc(N*sizeof(double));
    v     = (double*) malloc(N*sizeof(double));
    xold  = (double*) malloc(N*sizeof(double));
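    // A hedged alternative (not part of the original program): keep these
    // buffers resident on the device across CG calls with an unstructured
    // data region, e.g.
    //   #pragma acc enter data create(Ap[0:N], r[0:N], p[0:N], Fx[0:N],
    //                                 Fxold[0:N], v[0:N], xold[0:N])
    // paired with a matching #pragma acc exit data delete(...) at shutdown.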

    int cg_converged = 1;

    double timespent;

    // start timer
    timespent = -omp_get_wtime();

    // main timeloop
    double tolerance = 1.e-6;
    int timestep;

    for (timestep = 1; timestep <= nt; timestep++)
    {
        // set x_new and x_old to be the solution
        ss_copy(x_old, x_new, N);

        double residual;
        int    converged = 0;
        int    it = 1;
        for ( ; it <= 50; it++)
        {
            // compute residual : requires both x_new and x_old
            diffusion(x_new, b);
            residual = ss_norm2(b, N);

            // check for convergence
            if (residual < tolerance)
            {
                converged = 1;
                break;
            }

            // solve linear system to get -deltax
            ss_cg(deltax, b, 200, tolerance, &cg_converged);

            // check that the CG solver converged
            if (!cg_converged) break;

            // update solution
            ss_axpy(x_new, -1.0, deltax, N);
        }
        iters_newton += it;

        // output some statistics
        if (converged && verbose_output)
            printf("step %d required %d iterations for residual %E\n", timestep, it, residual);
        if (!converged)
        {
            fprintf(stderr, "step %d ERROR : nonlinear iterations failed to converge\n", timestep);
            break;
        }
    }

    // get times
    timespent += omp_get_wtime();
    unsigned long long flops_total = flops_diff + flops_blas1;

    ////////////////////////////////////////////////////////////////////
    // write final solution to BOV file for visualization
    ////////////////////////////////////////////////////////////////////

    // binary data
    {
        FILE* output = fopen("output.bin", "w");
        fwrite(x_new, sizeof(double), nx * ny, output);
        fclose(output);
    }

    // metadata
    {
        FILE* output = fopen("output.bov", "wb");
        fprintf(output, "TIME: 0.0\n");
        fprintf(output, "DATA_FILE: output.bin\n");
        fprintf(output, "DATA_SIZE: %d, %d, 1\n", nx, ny);
        fprintf(output, "DATA_FORMAT: DOUBLE\n");
        fprintf(output, "VARIABLE: phi\n");
        fprintf(output, "DATA_ENDIAN: LITTLE\n");
        fprintf(output, "CENTERING: nodal\n");
        //fprintf(output, "BYTE_OFFSET: 4\n");
        fprintf(output, "BRICK_SIZE: 1.0 %f 1.0\n", (ny - 1) * options.dx);
        fclose(output);
    }

    // print table summarizing results
    printf("--------------------------------------------------------------------------------\n");
    printf("simulation took %f seconds (%f GFLOP/s)\n", timespent, flops_total / 1e9 / timespent);
    printf("%u conjugate gradient iterations\n", iters_cg);
    printf("%u newton iterations\n", iters_newton);
    printf("--------------------------------------------------------------------------------\n");

    // deallocate global fields
    free (x_new);
    free (x_old);
    free (bndN);
    free (bndS);
    free (bndE);
    free (bndW);

    printf("Goodbye!\n");

    return 0;
}
Example #4
int main(int argc, char** argv)
{
    int iter_max = 1000;
    
    const float pi  = 2.0 * asinf(1.0f);
    const float tol = 1.0e-5f;

    int rank = 0;
    int size = 1;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    memset(A, 0, N * M * sizeof(float));
    memset(Aref, 0, N * M * sizeof(float));
    
    // set boundary conditions
    for (int j = 0; j < N; j++)
    {
        float y0     = sinf( 2.0 * pi * j / (N-1));
        A[j][0]      = y0;
        A[j][M-1]    = y0;
        Aref[j][0]   = y0;
        Aref[j][M-1] = y0;
    }
    
#if _OPENACC
    int ngpus=acc_get_num_devices(acc_device_nvidia);
    int devicenum=rank%ngpus;
    acc_set_device_num(devicenum,acc_device_nvidia);

    // Call acc_init after acc_set_device_num to avoid multiple contexts on device 0 in multi GPU systems
    acc_init(acc_device_nvidia);
#endif /*_OPENACC*/

    // Ensure correctness if N%size != 0
    int chunk_size = ceil( (1.0*N)/size );
    
    int jstart = rank * chunk_size;
    int jend   = jstart + chunk_size;
    
    // Do not process boundaries
    jstart = max( jstart, 1 );
    jend = min( jend, N - 1 );
    
    if ( rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", N, M);

    if ( rank == 0) printf("Calculate reference solution and time serial execution.\n");
    StartTimer();
    laplace2d_serial( rank, iter_max, tol );
    double runtime_serial = GetTimer();

    //Wait for all processes to ensure correct timing of the parallel version
    MPI_Barrier( MPI_COMM_WORLD );
    if ( rank == 0) printf("Parallel execution.\n");
    StartTimer();
    int iter  = 0;
    float error = 1.0f;
    
    #pragma acc data copy(A) create(Anew)
    while ( error > tol && iter < iter_max )
    {
        error = 0.f;

        #pragma acc kernels
        for (int j = jstart; j < jend; j++)
        {
            for( int i = 1; i < M-1; i++ )
            {
                Anew[j][i] = 0.25f * ( A[j][i+1] + A[j][i-1]
                                     + A[j-1][i] + A[j+1][i]);
                error = fmaxf( error, fabsf(Anew[j][i]-A[j][i]));
            }
        }
        float globalerror = 0.0f;
        MPI_Allreduce( &error, &globalerror, 1, MPI_FLOAT, MPI_MAX, MPI_COMM_WORLD );
        error = globalerror;
        
        //TODO: Split into halo and bulk part 
        #pragma acc kernels
        for (int j = jstart; j < jend; j++)
        {
            for( int i = 1; i < M-1; i++ )
            {
                A[j][i] = Anew[j][i];
            }
        }
        //TODO: Start bulk part asynchronously

        //Periodic boundary conditions
        int top    = (rank == 0) ? (size-1) : rank-1;
        int bottom = (rank == (size-1)) ? 0 : rank+1;

        #pragma acc host_data use_device( A )
        {
            //1. Send row jstart (first modified row) to top; receive lower boundary row (jend) from bottom
            MPI_Sendrecv( A[jstart], M, MPI_FLOAT, top   , 0, A[jend], M, MPI_FLOAT, bottom, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE );

            //2. Send row (jend-1) (last modified row) to bottom; receive upper boundary row (jstart-1) from top
            MPI_Sendrecv( A[(jend-1)], M, MPI_FLOAT, bottom, 0, A[(jstart-1)], M, MPI_FLOAT, top   , 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE );
        }
        
        //TODO: wait for bulk part
        if(rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
        
        iter++;
    }
    MPI_Barrier( MPI_COMM_WORLD );
    double runtime = GetTimer();

    if (check_results( rank, jstart, jend, tol ) && rank == 0)
    {
        printf( "Num GPUs: %d\n", size );
        printf( "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%\n", N,M, runtime_serial/ 1000.f, size, runtime/ 1000.f, runtime_serial/runtime, runtime_serial/(size*runtime)*100 );
    }
    MPI_Finalize();
    return 0;
}
Example #5
void blur5_pipelined_multi(unsigned char *restrict imgData, unsigned char *restrict out, long w, long h, long ch, long step)
{
  const int filtersize = 5, nblocks = 32;
  double filter[5][5] =
  {
     1,  1,  1,  1,  1,
     1,  2,  2,  2,  1,
     1,  2,  3,  2,  1,
     1,  2,  2,  2,  1,
     1,  1,  1,  1,  1
  };
  // The denominator for scale should be the sum
  // of non-zero elements in the filter.
  float scale = 1.0 / 35.0;

  long blocksize = h/ nblocks;
#pragma omp parallel num_threads(acc_get_num_devices(acc_device_nvidia))
  {
    printf("Thread %d of %d\n", omp_get_thread_num(), omp_get_num_threads());
    int myid = omp_get_thread_num();
    acc_set_device_num(myid,acc_device_nvidia);
    int queue = 1;
    // TODO: create a data region
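    // A hedged sketch of one way to satisfy this TODO: give each device a
    // resident copy of the image buffers for the duration of its blocks, e.g.
    //   #pragma acc data create(imgData[0:h*step], out[0:h*step])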
  {
#pragma omp for schedule(static)
  for ( long blocky = 0; blocky < nblocks; blocky++)
  {
    // For data copies we need to include the ghost zones for the filter
    long starty = MAX(0,blocky * blocksize - filtersize/2);
    long endy   = MIN(h,starty + blocksize + filtersize/2);
    // TODO: move data
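    // A hedged sketch: stage this block's input rows (ghost rows included,
    // using the starty/endy just computed) on the device asynchronously, e.g.
    //   #pragma acc update device(imgData[starty*step:(endy-starty)*step]) async(queue)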
    starty = blocky * blocksize;
    endy = starty + blocksize;
    // TODO: parallelize this loop
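    // A hedged sketch: run the block on this thread's device asynchronously, e.g.
    //   #pragma acc parallel loop collapse(2) async(queue)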
    for ( long y = starty; y < endy; y++ )
    {
      for ( long x = 0; x < w; x++ )
      {
        float blue = 0.0, green = 0.0, red = 0.0;
        for ( int fy = 0; fy < filtersize; fy++ )
        {
          long iy = y - (filtersize/2) + fy;
          for ( int fx = 0; fx < filtersize; fx++ )
          {
            long ix = x - (filtersize/2) + fx;
            if ( (iy<0)  || (ix<0) ||
                (iy>=h) || (ix>=w) ) continue;
            blue  += filter[fy][fx] * (float)imgData[iy * step + ix * ch];
            green += filter[fy][fx] * (float)imgData[iy * step + ix * ch + 1];
            red   += filter[fy][fx] * (float)imgData[iy * step + ix * ch + 2];
          }
        }
        out[y * step + x * ch]      = 255 - (scale * blue);
        out[y * step + x * ch + 1 ] = 255 - (scale * green);
        out[y * step + x * ch + 2 ] = 255 - (scale * red);
      }
    }
    // TODO: move data
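    // A hedged sketch: copy this block's output rows back asynchronously, e.g.
    //   #pragma acc update self(out[starty*step:(endy-starty)*step]) async(queue)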
    queue = (queue%3)+1;
  }
  // TODO: create synchronization point
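  // A hedged sketch: drain this device's async queues before leaving, e.g.
  //   #pragma acc wait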
  }
  }
}
Example #6
int main(int argc, char** argv)
{
    int iter_max = 1000;
    
    const float pi  = 2.0 * asinf(1.0f);
    const float tol = 1.0e-5f;

    int rank = 0;
    int size = 1;

    //TODO: Initialize MPI and determine rank and size
    //int MPI_Init(int *argc, char ***argv);
    //int MPI_Comm_rank(MPI_Comm comm, int *rank);
    //int MPI_Comm_size(MPI_Comm comm, int *size);
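    //A hedged sketch, mirroring the completed Example #4 above:
    //  MPI_Init(&argc, &argv);
    //  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    //  MPI_Comm_size(MPI_COMM_WORLD, &size);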

    memset(A, 0, N * M * sizeof(float));
    memset(Aref, 0, N * M * sizeof(float));
    
    // set boundary conditions
    for (int j = 0; j < N; j++)
    {
        float y0     = sinf( 2.0 * pi * j / (N-1));
        A[j][0]      = y0;
        A[j][M-1]    = y0;
        Aref[j][0]   = y0;
        Aref[j][M-1] = y0;
    }
    
#if _OPENACC
    int ngpus=acc_get_num_devices(acc_device_nvidia);
    //TODO: choose device to use by this rank
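    //A hedged sketch, mirroring Example #4: devicenum = rank % ngpus;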
    int devicenum=0;
    acc_set_device_num(devicenum,acc_device_nvidia);

    // Call acc_init after acc_set_device_num to avoid multiple contexts on device 0 in multi GPU systems
    acc_init(acc_device_nvidia);
#endif /*_OPENACC*/

    int jstart = 1;
    int jend   = N-1;

    if ( rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", N, M);

    if ( rank == 0) printf("Calculate reference solution and time serial execution.\n");
    StartTimer();
    laplace2d_serial( rank, iter_max, tol );
    double runtime_serial = GetTimer();

    //TODO: Wait for all processes to ensure correct timing of the parallel version
    //int MPI_Barrier(MPI_Comm comm);
    if ( rank == 0) printf("Parallel execution.\n");
    StartTimer();
    int iter  = 0;
    float error = 1.0f;
    
    #pragma acc data copy(A) create(Anew)
    while ( error > tol && iter < iter_max )
    {
        error = 0.f;

        #pragma acc kernels
        for (int j = jstart; j < jend; j++)
        {
            for( int i = 1; i < M-1; i++ )
            {
                Anew[j][i] = 0.25f * ( A[j][i+1] + A[j][i-1]
                                     + A[j-1][i] + A[j+1][i]);
                error = fmaxf( error, fabsf(Anew[j][i]-A[j][i]));
            }
        }
        
        #pragma acc kernels
        for (int j = jstart; j < jend; j++)
        {
            for( int i = 1; i < M-1; i++ )
            {
                A[j][i] = Anew[j][i];
            }
        }

        //Periodic boundary conditions
        #pragma acc kernels
        for( int i = 1; i < M-1; i++ )
        {
                A[0][i]     = A[(N-2)][i];
                A[(N-1)][i] = A[1][i];
        }
        
        if(rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
        
        iter++;
    }
    //TODO: Wait for all processes to ensure correct timing of the parallel version
    //int MPI_Barrier(MPI_Comm comm);
    double runtime = GetTimer();

    if (check_results( rank, jstart, jend, tol ) && rank == 0)
    {
        printf( "Num GPUs: %d\n", size );
        printf( "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%\n", N,M, runtime_serial/ 1000.f, size, runtime/ 1000.f, runtime_serial/runtime, runtime_serial/(size*runtime)*100 );
    }
    //TODO: Finalize MPI
    //int MPI_Finalize();
    return 0;
}
Example #7
int
main (int argc, char **argv)
{
    cublasStatus_t s;
    cublasHandle_t h;
    CUcontext pctx;
    CUresult r;
    int i;
    const int N = 256;
    float *h_X, *h_Y1, *h_Y2;
    float *d_X,*d_Y;
    float alpha = 2.0f;
    float error_norm;
    float ref_norm;

    /* Test 4 - OpenACC creates, cuBLAS shares.  */

    acc_set_device_num (0, acc_device_nvidia);

    r = cuCtxGetCurrent (&pctx);
    if (r != CUDA_SUCCESS)
    {
        fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r);
        exit (EXIT_FAILURE);
    }

    h_X = (float *) malloc (N * sizeof (float));
    if (h_X == 0)
    {
        fprintf (stderr, "malloc failed: for h_X\n");
        exit (EXIT_FAILURE);
    }

    h_Y1 = (float *) malloc (N * sizeof (float));
    if (h_Y1 == 0)
    {
        fprintf (stderr, "malloc failed: for h_Y1\n");
        exit (EXIT_FAILURE);
    }

    h_Y2 = (float *) malloc (N * sizeof (float));
    if (h_Y2 == 0)
    {
        fprintf (stderr, "malloc failed: for h_Y2\n");
        exit (EXIT_FAILURE);
    }

    for (i = 0; i < N; i++)
    {
        h_X[i] = rand () / (float) RAND_MAX;
        h_Y2[i] = h_Y1[i] = rand () / (float) RAND_MAX;
    }

#pragma acc parallel copyin (h_X[0:N]), copy (h_Y2[0:N]) copy (alpha)
    {
        int i;

        for (i = 0; i < N; i++)
        {
            h_Y2[i] = alpha * h_X[i] + h_Y2[i];
        }
    }

    r = cuCtxGetCurrent (&pctx);
    if (r != CUDA_SUCCESS)
    {
        fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r);
        exit (EXIT_FAILURE);
    }

    d_X = (float *) acc_copyin (&h_X[0], N * sizeof (float));
    if (d_X == NULL)
    {
        fprintf (stderr, "copyin error h_Y1\n");
        exit (EXIT_FAILURE);
    }

    d_Y = (float *) acc_copyin (&h_Y1[0], N * sizeof (float));
    if (d_Y == NULL)
    {
        fprintf (stderr, "copyin error h_Y1\n");
        exit (EXIT_FAILURE);
    }

    s = cublasCreate (&h);
    if (s != CUBLAS_STATUS_SUCCESS)
    {
        fprintf (stderr, "cublasCreate failed: %d\n", s);
        exit (EXIT_FAILURE);
    }

    context_check (pctx);

    s = cublasSaxpy (h, N, &alpha, d_X, 1, d_Y, 1);
    if (s != CUBLAS_STATUS_SUCCESS)
    {
        fprintf (stderr, "cublasSaxpy failed: %d\n", s);
        exit (EXIT_FAILURE);
    }

    context_check (pctx);

    acc_memcpy_from_device (&h_Y1[0], d_Y, N * sizeof (float));

    context_check (pctx);

    error_norm = 0;
    ref_norm = 0;

    for (i = 0; i < N; ++i)
    {
        float diff;

        diff = h_Y1[i] - h_Y2[i];
        error_norm += diff * diff;
        ref_norm += h_Y2[i] * h_Y2[i];
    }

    error_norm = (float) sqrt ((double) error_norm);
    ref_norm = (float) sqrt ((double) ref_norm);

    if ((fabs (ref_norm) < 1e-7) || ((error_norm / ref_norm) >= 1e-6f))
    {
        fprintf (stderr, "math error\n");
        exit (EXIT_FAILURE);
    }

    free (h_X);
    free (h_Y1);
    free (h_Y2);

    acc_free (d_X);
    acc_free (d_Y);

    context_check (pctx);

    s = cublasDestroy (h);
    if (s != CUBLAS_STATUS_SUCCESS)
    {
        fprintf (stderr, "cublasDestroy failed: %d\n", s);
        exit (EXIT_FAILURE);
    }

    context_check (pctx);

    acc_shutdown (acc_device_nvidia);

    r = cuCtxGetCurrent (&pctx);
    if (r != CUDA_SUCCESS)
    {
        fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r);
        exit (EXIT_FAILURE);
    }

    if (pctx)
    {
        fprintf (stderr, "Unexpected context\n");
        exit (EXIT_FAILURE);
    }

    return EXIT_SUCCESS;
}
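The context_check helper used throughout this test is not shown in the listing. A minimal sketch consistent with how it is called here (the name is reused from the calls above; the body is an assumption, not the original implementation):

static void
context_check (CUcontext ctx_init)
{
    CUcontext ctx_cur;

    /* Verify the current CUDA context is still the one OpenACC created. */
    if (cuCtxGetCurrent (&ctx_cur) != CUDA_SUCCESS || ctx_cur != ctx_init)
    {
        fprintf (stderr, "unexpected CUDA context\n");
        exit (EXIT_FAILURE);
    }
}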
Example #8
void ops_init_backend() {
  acc_set_device_num(ops_get_proc() % acc_get_num_devices(acc_device_nvidia),
                     acc_device_nvidia);
  ops_device_initialised_externally = 1;
}
Example #9
int main(int argc, char** argv)
{
    int iter_max = 1000;
    
    const real tol = 1.0e-5;

    int rank = 0;
    int size = 1;

    //Initialize MPI and determine rank and size
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    
    if ( size > MAX_MPI_SIZE )
    {
        if ( 0 == rank )
        {
            fprintf(stderr,"ERROR: Only up to %d MPI ranks are supported.\n",MAX_MPI_SIZE);
        }
        return -1;
    }
    
    dim2 size2d = size_to_2Dsize(size);
    int sizex = size2d.x;
    int sizey = size2d.y;
    assert(sizex*sizey == size);
    
    int rankx = rank%sizex;
    int ranky = rank/sizex;

    memset(A, 0, NY * NX * sizeof(real));
    memset(Aref, 0, NY * NX * sizeof(real));
    
    // set rhs
    for (int iy = 1; iy < NY-1; iy++)
    {
        for( int ix = 1; ix < NX-1; ix++ )
        {
            const real x = -1.0 + (2.0*ix/(NX-1));
            const real y = -1.0 + (2.0*iy/(NY-1));
            rhs[iy][ix] = expr(-10.0*(x*x + y*y));
        }
    }
    
#if _OPENACC
    acc_device_t device_type = acc_get_device_type();
    if ( acc_device_nvidia == device_type )
    {
        int ngpus=acc_get_num_devices(acc_device_nvidia);
        
        int devicenum=rank%ngpus;
        acc_set_device_num(devicenum,acc_device_nvidia);
    }
    // Call acc_init after acc_set_device_num to avoid multiple contexts on device 0 in multi GPU systems
    acc_init(device_type);
#endif /*_OPENACC*/

    // Ensure correctness if NX%sizex != 0
    int chunk_sizex = ceil( (1.0*NX)/sizex );

    int ix_start = rankx * chunk_sizex;
    int ix_end   = ix_start + chunk_sizex;

    // Do not process boundaries
    ix_start = max( ix_start, 1 );
    ix_end = min( ix_end, NX - 1 );

    // Ensure correctness if NY%sizey != 0
    int chunk_sizey = ceil( (1.0*NY)/sizey );

    int iy_start = ranky * chunk_sizey;
    int iy_end   = iy_start + chunk_sizey;

    // Do not process boundaries
    iy_start = max( iy_start, 1 );
    iy_end = min( iy_end, NY - 1 );

    if ( rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", NY, NX);

    if ( rank == 0) printf("Calculate reference solution and time serial execution.\n");
    StartTimer();
    poisson2d_serial( rank, iter_max, tol );
    double runtime_serial = GetTimer();

    //Wait for all processes to ensure correct timing of the parallel version
    MPI_Barrier( MPI_COMM_WORLD );
    if ( rank == 0) printf("Parallel execution.\n");
    StartTimer();
    int iter  = 0;
    real error = 1.0;
    
    #pragma acc data copy(A) copyin(rhs) create(Anew,to_left,from_left,to_right,from_right)
    while ( error > tol && iter < iter_max )
    {
        error = 0.0;

        #pragma acc kernels
        for (int iy = iy_start; iy < iy_end; iy++)
        {
            for( int ix = ix_start; ix < ix_end; ix++ )
            {
                Anew[iy][ix] = -0.25 * (rhs[iy][ix] - ( A[iy][ix+1] + A[iy][ix-1]
                                                       + A[iy-1][ix] + A[iy+1][ix] ));
                error = fmaxr( error, fabsr(Anew[iy][ix]-A[iy][ix]));
            }
        }
        
        real globalerror = 0.0;
        MPI_Allreduce( &error, &globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD );
        error = globalerror;
        
        #pragma acc kernels
        for (int iy = iy_start; iy < iy_end; iy++)
        {
            for( int ix = ix_start; ix < ix_end; ix++ )
            {
                A[iy][ix] = Anew[iy][ix];
            }
        }

        //Periodic boundary conditions
        int topy    = (ranky == 0) ? (sizey-1) : ranky-1;
        int bottomy = (ranky == (sizey-1)) ? 0 : ranky+1;
        int top    = topy    * sizex + rankx;
        int bottom = bottomy * sizex + rankx;
        #pragma acc host_data use_device( A )
        {
            //1. Send row iy_start (first modified row) to top; receive lower boundary row (iy_end) from bottom
            MPI_Sendrecv( &A[iy_start][ix_start], (ix_end-ix_start), MPI_REAL_TYPE, top   , 0, &A[iy_end][ix_start], (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE );

            //2. Send row (iy_end-1) (last modified row) to bottom; receive upper boundary row (iy_start-1) from top
            MPI_Sendrecv( &A[(iy_end-1)][ix_start], (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0, &A[(iy_start-1)][ix_start], (ix_end-ix_start), MPI_REAL_TYPE, top   , 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE );
        }
        
        int leftx  = (rankx == 0) ? (sizex-1) : rankx-1;
        int rightx = (rankx == (sizex-1)) ? 0 : rankx+1;
        int left   = ranky * sizex + leftx;
        int right  = ranky * sizex + rightx;
        #pragma acc kernels
        for( int iy = iy_start; iy < iy_end; iy++ )
        {
                to_left[iy]  = A[iy][ix_start];
                to_right[iy] = A[iy][ix_end-1];
        }
        #pragma acc host_data use_device( to_left, from_left, to_right, from_right )
        {
            //1. Send to_left (rows iy_start..iy_end-1, the modified rows) to the left neighbor and receive the same rows into from_right from the right neighbor
            MPI_Sendrecv( to_left+iy_start, (iy_end-iy_start), MPI_REAL_TYPE, left   , 0, from_right+iy_start, (iy_end-iy_start), MPI_REAL_TYPE, right, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE );

            //2. Send to_right (rows iy_start..iy_end-1, the modified rows) to the right neighbor and receive the same rows into from_left from the left neighbor
            MPI_Sendrecv( to_right+iy_start, (iy_end-iy_start), MPI_REAL_TYPE, right , 0, from_left+iy_start, (iy_end-iy_start), MPI_REAL_TYPE, left  , 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE );
        }
        #pragma acc kernels
        for( int iy = iy_start; iy < iy_end; iy++ )
        {
                A[iy][ix_start-1] = from_left[iy];
                A[iy][ix_end]     = from_right[iy];
        }
        
        if(rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
        
        iter++;
    }

    MPI_Barrier( MPI_COMM_WORLD );
    double runtime = GetTimer();

    if (check_results( rank, ix_start, ix_end, iy_start, iy_end, tol ) && rank == 0)
    {
        printf( "Num GPUs: %d with a (%d,%d) layout.\n", size, sizey,sizex );
        printf( "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%\n", NY,NX, runtime_serial/ 1000.0, size, runtime/ 1000.0, runtime_serial/runtime, runtime_serial/(size*runtime)*100 );
    }

    MPI_Finalize();
    return 0;
}
Example #10
int main(int argc, char ** argv) {

    acc_init(acc_device_cuda);
    acc_set_device_num(0, acc_device_cuda);
	// Keep track of the start time of the program
  	long long program_start_time = get_time();
	
	// Let the user specify the number of frames to process
	int num_frames = 1;
	
	if (argc != 3){
		fprintf(stderr, "usage: %s <input file> <num of frames>", argv[0]);
		exit(1);
	}
	
	num_frames = atoi(argv[2]);
	// Open video file
	char *video_file_name;
	video_file_name = argv[1];
	
	avi_t *cell_file = AVI_open_input_file(video_file_name, 1);
	if (cell_file == NULL)	{
		AVI_print_error("Error with AVI_open_input_file");
		return -1;
	}
	
	int i, j, *crow, *ccol, pair_counter = 0, x_result_len = 0, Iter = 20, ns = 4, k_count = 0, n;
	MAT *cellx, *celly, *A;
	double *GICOV_spots, *t, *G, *x_result, *y_result, *V, *QAX_CENTERS, *QAY_CENTERS;
	double threshold = 1.8, radius = 10.0, delta = 3.0, dt = 0.01, b = 5.0;
	
	// Extract a cropped version of the first frame from the video file
	MAT *image_chopped = get_frame(cell_file, 0, 1, 0);
	printf("Detecting cells in frame 0\n");
	
	// Get gradient matrices in x and y directions
	MAT *grad_x = gradient_x(image_chopped);
	MAT *grad_y = gradient_y(image_chopped);
	
	m_free(image_chopped);
	
	// Get GICOV matrix corresponding to image gradients

	long long GICOV_start_time = get_time();
	
	/*
	MAT *gicov = ellipsematching(grad_x, grad_y);
	
	// Square GICOV values
	MAT *max_gicov = m_get(gicov->m, gicov->n);
	for (i = 0; i < gicov->m; i++) {
		for (j = 0; j < gicov->n; j++) {
			double val = m_get_val(gicov, i, j);
			m_set_val(max_gicov, i, j, val * val);
		}
	}
	*/
	MAT *gicov = GICOV(grad_x, grad_y);		

	long long GICOV_end_time = get_time();
	
	// Dilate the GICOV matrix
	long long dilate_start_time = get_time();
	//MAT *strel = structuring_element_f(12);
	//MAT *img_dilated = dilate_f(gicov, strel);
	MAT *img_dilated = dilate(gicov);

	long long dilate_end_time = get_time();
	
	// Find possible matches for cell centers based on GICOV and record the rows/columns in which they are found
	pair_counter = 0;
	crow = (int *) malloc(gicov->m * gicov->n * sizeof(int));
	ccol = (int *) malloc(gicov->m * gicov->n * sizeof(int));
	for (i = 0; i < gicov->m; i++) {
		for (j = 0; j < gicov->n; j++) {
			if (!(m_get_val(gicov,i,j) == 0.0) && (m_get_val(img_dilated,i,j) == m_get_val(gicov,i,j))) {
				crow[pair_counter] = i;
				ccol[pair_counter] = j;
				pair_counter++;
			}
		}
	}

	
	GICOV_spots = (double *) malloc(sizeof(double)*pair_counter);
	for (i = 0; i < pair_counter; i++)
		GICOV_spots[i] = sqrt(m_get_val(gicov, crow[i], ccol[i]));
	
	G = (double *) calloc(pair_counter, sizeof(double));
	x_result = (double *) calloc(pair_counter, sizeof(double));
	y_result = (double *) calloc(pair_counter, sizeof(double));
	
	x_result_len = 0;
	for (i = 0; i < pair_counter; i++) {
		if ((crow[i] > 29) && (crow[i] < BOTTOM - TOP + 39)) {
			x_result[x_result_len] = ccol[i];
			y_result[x_result_len] = crow[i] - 40;
			G[x_result_len] = GICOV_spots[i];
			x_result_len++;
		}
	}
	
	// Make an array t which holds each "time step" for the possible cells
	t = (double *) malloc(sizeof(double) * 36);
	for (i = 0; i < 36; i++) {
		t[i] = (double)i * 2.0 * PI / 36.0;
	}
	
	// Store cell boundaries (as simple circles) for all cells
	cellx = m_get(x_result_len, 36);
	celly = m_get(x_result_len, 36);
	for(i = 0; i < x_result_len; i++) {
		for(j = 0; j < 36; j++) {
			m_set_val(cellx, i, j, x_result[i] + radius * cos(t[j]));
			m_set_val(celly, i, j, y_result[i] + radius * sin(t[j]));
		}
	}
	
	A = TMatrix(9,4);

	
	V = (double *) malloc(sizeof(double) * pair_counter);
	QAX_CENTERS = (double * )malloc(sizeof(double) * pair_counter);
	QAY_CENTERS = (double *) malloc(sizeof(double) * pair_counter);
	memset(V, 0, sizeof(double) * pair_counter);
	memset(QAX_CENTERS, 0, sizeof(double) * pair_counter);
	memset(QAY_CENTERS, 0, sizeof(double) * pair_counter);

	// For all possible results, find the ones that are feasibly leukocytes and store their centers
	k_count = 0;
	for (n = 0; n < x_result_len; n++) {
		if ((G[n] < -1 * threshold) || G[n] > threshold) {
			MAT * x, *y;
			VEC * x_row, * y_row;
			x = m_get(1, 36);
			y = m_get(1, 36);

			x_row = v_get(36);
			y_row = v_get(36);

			// Get current values of possible cells from cellx/celly matrices
			x_row = get_row(cellx, n, x_row);
			y_row = get_row(celly, n, y_row);
			uniformseg(x_row, y_row, x, y);

			// Make sure that the possible leukocytes are not too close to the edge of the frame
			if ((m_min(x) > b) && (m_min(y) > b) && (m_max(x) < cell_file->width - b) && (m_max(y) < cell_file->height - b)) {
				MAT * Cx, * Cy, *Cy_temp, * Ix1, * Iy1;
				VEC  *Xs, *Ys, *W, *Nx, *Ny, *X, *Y;
				Cx = m_get(1, 36);
				Cy = m_get(1, 36);
				Cx = mmtr_mlt(A, x, Cx);
				Cy = mmtr_mlt(A, y, Cy);
				
				Cy_temp = m_get(Cy->m, Cy->n);
				
				for (i = 0; i < 9; i++)
					m_set_val(Cy, i, 0, m_get_val(Cy, i, 0) + 40.0);
					
				// Iteratively refine the snake/spline
				for (i = 0; i < Iter; i++) {
					int typeofcell;
					
					if(G[n] > 0.0) typeofcell = 0;
					else typeofcell = 1;
					
					splineenergyform01(Cx, Cy, grad_x, grad_y, ns, delta, 2.0 * dt, typeofcell);
				}
				
				X = getsampling(Cx, ns);
				for (i = 0; i < Cy->m; i++)
					m_set_val(Cy_temp, i, 0, m_get_val(Cy, i, 0) - 40.0);
				Y = getsampling(Cy_temp, ns);
				
				Ix1 = linear_interp2(grad_x, X, Y);
				Iy1 = linear_interp2(grad_y, X, Y);
				Xs = getfdriv(Cx, ns);
				Ys = getfdriv(Cy, ns);
				
				Nx = v_get(Ys->dim);
				for (i = 0; i < Ys->dim; i++)
					v_set_val(Nx, i, v_get_val(Ys, i) / sqrt(v_get_val(Xs, i)*v_get_val(Xs, i) + v_get_val(Ys, i)*v_get_val(Ys, i)));
					
				Ny = v_get(Xs->dim);
				for (i = 0; i < Xs->dim; i++)
					v_set_val(Ny, i, -1.0 * v_get_val(Xs, i) / sqrt(v_get_val(Xs, i)*v_get_val(Xs, i) + v_get_val(Ys, i)*v_get_val(Ys, i)));
					
				W = v_get(Nx->dim);
				for (i = 0; i < Nx->dim; i++)
					v_set_val(W, i, m_get_val(Ix1, 0, i) * v_get_val(Nx, i) + m_get_val(Iy1, 0, i) * v_get_val(Ny, i));
					
				V[n] = mean(W) / std_dev(W);
				
				//get means of X and Y values for all "snaxels" of the spline contour, thus finding the cell centers
				QAX_CENTERS[k_count] = mean(X);
				QAY_CENTERS[k_count] = mean(Y) + TOP;
				
				k_count++;
				
				// Free memory
				v_free(W);
				v_free(Ny);
				v_free(Nx);
				v_free(Ys);
				v_free(Xs);
				m_free(Iy1);
				m_free(Ix1);
				v_free(Y);
				v_free(X);
				m_free(Cy_temp);
				m_free(Cy);
				m_free(Cx);				
			}
			
			// Free memory
			v_free(y_row);
			v_free(x_row);
			m_free(y);
			m_free(x);
		}
	}

	// Free memory
	free(V);
	free(ccol);
	free(crow);
	free(GICOV_spots);
	free(t);
	free(G);
	free(x_result);
	free(y_result);
	m_free(A);
	m_free(celly);
	m_free(cellx);
	m_free(img_dilated);
	m_free(gicov);
	m_free(grad_y);
	m_free(grad_x);
	
	// Report the total number of cells detected
	printf("Cells detected: %d\n\n", k_count);
	
	// Report the breakdown of the detection runtime
	printf("Detection runtime\n");
	printf("-----------------\n");
	printf("GICOV computation: %.5f seconds\n", ((float) (GICOV_end_time - GICOV_start_time)) / (1000*1000));
	printf("   GICOV dilation: %.5f seconds\n", ((float) (dilate_end_time - dilate_start_time)) / (1000*1000));
	printf("            Total: %.5f seconds\n", ((float) (get_time() - program_start_time)) / (1000*1000));
	
	// Now that the cells have been detected in the first frame,
	//  track the ellipses through subsequent frames
	if (num_frames > 1) printf("\nTracking cells across %d frames\n", num_frames);
	else                printf("\nTracking cells across 1 frame\n");
	long long tracking_start_time = get_time();
	int num_snaxels = 20;
	ellipsetrack(cell_file, QAX_CENTERS, QAY_CENTERS, k_count, radius, num_snaxels, num_frames);
	printf("           Total: %.5f seconds\n", ((float) (get_time() - tracking_start_time)) / (float) (1000*1000*num_frames));
	
	// Report total program execution time
    printf("\nTotal application run time: %.5f seconds\n", ((float) (get_time() - program_start_time)) / (1000*1000));

	return 0;
}
Example #11
/**
 * \brief Creates and initializes the working data for the plan
 * \param plan The Plan struct that holds the plan's data values.
 * \return Error flag value
 */
int initDOPENACCGEMMPlan(void *plan){   // <- Replace YOUR_NAME with the name of your module.
    if(!plan){
        return make_error(ALLOC, generic_err);           // <- This is the error code for one of the malloc fails.
    }
    Plan *p;
    DOPENACCGEMM_DATA *d;
    p = (Plan *)plan;

    #ifdef HAVE_PAPI
    int temp_event, i;
    int PAPI_Events [NUM_PAPI_EVENTS] = PAPI_COUNTERS;
    char *PAPI_units [NUM_PAPI_EVENTS] = PAPI_UNITS;
    #endif //HAVE_PAPI

    if(p){
        d = (DOPENACCGEMM_DATA *)p->vptr;
        p->exec_count = 0;           // Initialize the plan execution count to zero.
        perftimer_init(&p->timers, NUM_TIMERS);         // Initialize all performance timers to zero.

        #ifdef HAVE_PAPI
        /* Initialize plan's PAPI data */
        p->PAPI_EventSet = PAPI_NULL;
        p->PAPI_Num_Events = 0;

        TEST_PAPI(PAPI_create_eventset(&p->PAPI_EventSet), PAPI_OK, MyRank, 9999, PRINT_SOME);

        //Add the desired events to the Event Set; ensure the desired counters
        //  are on the system, then add them; ignore otherwise
        for(i = 0; i < TOTAL_PAPI_EVENTS && i < NUM_PAPI_EVENTS; i++){
            temp_event = PAPI_Events[i];
            if(PAPI_query_event(temp_event) == PAPI_OK){
                p->PAPI_Num_Events++;
                TEST_PAPI(PAPI_add_event(p->PAPI_EventSet, temp_event), PAPI_OK, MyRank, 9999, PRINT_SOME);
            }
        }

        PAPIRes_init(p->PAPI_Results, p->PAPI_Times);
        PAPI_set_units(p->name, PAPI_units, NUM_PAPI_EVENTS);

        TEST_PAPI(PAPI_start(p->PAPI_EventSet), PAPI_OK, MyRank, 9999, PRINT_SOME);
        #endif     //HAVE_PAPI
    }
    if(d){
      int error;

      acc_device_t my_device = acc_get_device_type();
      acc_set_device_num(d->device_id, my_device);

      //When OpenACC can report back on accelerator size, these two lines should be enabled
      //d->device_memory = system_burn_accelerator_memory(d->device_id);
      //d->device_memory -= SUB_FACTOR;

      d->M = ((int)sqrt(d->device_memory/sizeof(double))) / 3;

      size_t page_size = sysconf(_SC_PAGESIZE);
      error = posix_memalign((void **)&(d->A_buffer),page_size,d->M*d->M*sizeof(double));
      assert(error==0);

      error = posix_memalign((void **)&(d->B_buffer),page_size,d->M*d->M*sizeof(double));
      assert(error==0);

      error = posix_memalign((void **)&(d->C_buffer),page_size,d->M*d->M*sizeof(double));
      assert(error==0);

      for(size_t idx=0; idx < d->M*d->M; idx++) {
        d->A_buffer[idx] = (double)4.5;
        d->B_buffer[idx] = (double)2.0;
        d->C_buffer[idx] = (double)0.0;
      }
    }
    return ERR_CLEAN;     // <- This indicates a clean run with no errors. Does not need to be changed.
} /* initDOPENACCGEMMPlan */