int init_accel (void)
{
#if defined(_ENABLE_OPENACC_) || defined(_ENABLE_CUDA_)
    char * str;
    int local_rank, dev_count;
    int dev_id = 0;
#endif
#ifdef _ENABLE_CUDA_
    CUresult curesult = CUDA_SUCCESS;
    CUdevice cuDevice;
#endif

    switch (options.accel) {
#ifdef _ENABLE_CUDA_
        case managed:
        case cuda:
            if ((str = getenv("LOCAL_RANK")) != NULL) {
                cudaGetDeviceCount(&dev_count);
                local_rank = atoi(str);
                dev_id = local_rank % dev_count;
            }

            curesult = cuInit(0);
            if (curesult != CUDA_SUCCESS) {
                return 1;
            }

            curesult = cuDeviceGet(&cuDevice, dev_id);
            if (curesult != CUDA_SUCCESS) {
                return 1;
            }

            curesult = cuCtxCreate(&cuContext, 0, cuDevice);
            if (curesult != CUDA_SUCCESS) {
                return 1;
            }
            break;
#endif
#ifdef _ENABLE_OPENACC_
        case openacc:
            if ((str = getenv("LOCAL_RANK")) != NULL) {
                dev_count = acc_get_num_devices(acc_device_not_host);
                local_rank = atoi(str);
                dev_id = local_rank % dev_count;
            }

            acc_set_device_num (dev_id, acc_device_not_host);
            break;
#endif
        default:
            fprintf(stderr, "Invalid device type, should be cuda or openacc\n");
            return 1;
    }

    return 0;
}
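A minimal caller sketch for init_accel(), not part of the original source: process_options() and the global options it is assumed to fill are hypothetical placeholders; only init_accel() comes from the snippet above.

/* Hypothetical usage sketch: process_options() is an assumed helper that
 * parses argv into the global 'options' used by init_accel(). */
int main (int argc, char **argv)
{
    process_options(argc, argv);   /* assumed to select cuda/openacc in options.accel */

    if (init_accel() != 0) {
        fprintf(stderr, "Error initializing accelerator\n");
        return EXIT_FAILURE;
    }

    /* ... allocate device buffers and run the benchmark ... */
    return EXIT_SUCCESS;
}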
void acc_shutdown(acc_device_t dev)
{
  acc_init_once();

  size_t num_devices = acc_get_num_devices(dev);
  assert(num_devices > 0);

  unsigned i;
  for (i = 0; i < num_devices; i++)
    acc_shutdown_(dev, i);
}
int main (int argc, char **argv)
{
  void *d;
  acc_device_t devtype = acc_device_host;

#if ACC_DEVICE_TYPE_nvidia
  devtype = acc_device_nvidia;

  if (acc_get_num_devices (acc_device_nvidia) == 0)
    return 0;
#endif

  acc_init (devtype);

  d = acc_malloc (0);
  if (d != NULL)
    abort ();

  acc_free (0);

  acc_shutdown (devtype);

  acc_set_device_type (devtype);

  d = acc_malloc (0);
  if (d != NULL)
    abort ();

  acc_shutdown (devtype);

  acc_init (devtype);

  d = acc_malloc (1024);
  if (d == NULL)
    abort ();

  acc_free (d);

  acc_shutdown (devtype);

  acc_set_device_type (devtype);

  d = acc_malloc (1024);
  if (d == NULL)
    abort ();

  acc_free (d);

  acc_shutdown (devtype);

  return 0;
}
int main (int argc, char **argv)
{
  acc_device_t devtype = acc_device_host;

#if ACC_DEVICE_TYPE_nvidia
  devtype = acc_device_nvidia;

  if (acc_get_num_devices (devtype) == 0)
    return 0;
#endif

  acc_init (devtype);
  acc_init (devtype);

  return 0;
}
int main (int argc, char **argv)
{
  acc_device_t devtype = acc_device_host;

#if ACC_DEVICE_TYPE_nvidia
  devtype = acc_device_nvidia;

  if (acc_get_num_devices (acc_device_nvidia) == 0)
    return 0;
#endif

  acc_init (devtype);
  acc_shutdown (devtype);

  fprintf (stderr, "CheCKpOInT\n");

  acc_shutdown (devtype);

  return 0;
}
static void initGPU(int argc, char** argv) {
  // gets the device id (if specified) to run on
  int devId = -1;
  if (argc > 1) {
    devId = atoi(argv[1]);
    int devCount = acc_get_num_devices(acc_device_nvidia);
    if (devId < 0 || devId >= devCount) {
      printf("The specified device ID is not supported.\n");
      exit(1);
    }
  }
  if (devId != -1) {
    acc_set_device_num(devId, acc_device_nvidia);
  }

  // creates a context on the GPU just to
  // exclude initialization time from computations
  acc_init(acc_device_nvidia);

  // print device id
  devId = acc_get_device_num(acc_device_nvidia);
  printf("Running on GPU with ID %d.\n\n", devId);
}
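A minimal usage sketch for initGPU(), not from the original source: main() and compute() are hypothetical, and the device ID is assumed to arrive as the first command-line argument, as the function expects.

/* Hypothetical usage sketch for initGPU(); compute() is a placeholder for the offloaded work. */
int main(int argc, char** argv) {
  initGPU(argc, argv);   // selects the device from argv[1] (if given) and creates the context
  compute();             // assumed offloaded work, now timed without device start-up cost
  return 0;
}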
int main(int argc, char** argv)
{
    int iter_max = 1000;

    const float pi  = 2.0 * asinf(1.0f);
    const float tol = 1.0e-5f;

    int rank = 0;
    int size = 1;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    memset(A,    0, N * M * sizeof(float));
    memset(Aref, 0, N * M * sizeof(float));

    // set boundary conditions
    for (int j = 0; j < N; j++)
    {
        float y0 = sinf( 2.0 * pi * j / (N-1));
        A[j][0]      = y0;
        A[j][M-1]    = y0;
        Aref[j][0]   = y0;
        Aref[j][M-1] = y0;
    }

#if _OPENACC
    int ngpus = acc_get_num_devices(acc_device_nvidia);
    int devicenum = rank % ngpus;
    acc_set_device_num(devicenum, acc_device_nvidia);
    // Call acc_init after acc_set_device_num to avoid multiple contexts on device 0 in multi GPU systems
    acc_init(acc_device_nvidia);
#endif /*_OPENACC*/

    // Ensure correctness if N%size != 0
    int chunk_size = ceil( (1.0*N)/size );

    int jstart = rank * chunk_size;
    int jend   = jstart + chunk_size;

    // Do not process boundaries
    jstart = max( jstart, 1 );
    jend   = min( jend, N - 1 );

    if ( rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", N, M);

    if ( rank == 0) printf("Calculate reference solution and time serial execution.\n");
    StartTimer();
    laplace2d_serial( rank, iter_max, tol );
    double runtime_serial = GetTimer();

    //Wait for all processes to ensure correct timing of the parallel version
    MPI_Barrier( MPI_COMM_WORLD );
    if ( rank == 0) printf("Parallel execution.\n");
    StartTimer();
    int iter    = 0;
    float error = 1.0f;

    #pragma acc data copy(A) create(Anew)
    while ( error > tol && iter < iter_max )
    {
        error = 0.f;

        #pragma acc kernels
        for (int j = jstart; j < jend; j++)
        {
            for( int i = 1; i < M-1; i++ )
            {
                Anew[j][i] = 0.25f * ( A[j][i+1] + A[j][i-1]
                                     + A[j-1][i] + A[j+1][i]);
                error = fmaxf( error, fabsf(Anew[j][i]-A[j][i]));
            }
        }

        float globalerror = 0.0f;
        MPI_Allreduce( &error, &globalerror, 1, MPI_FLOAT, MPI_MAX, MPI_COMM_WORLD );
        error = globalerror;

        //TODO: Split into halo and bulk part
        #pragma acc kernels
        for (int j = jstart; j < jend; j++)
        {
            for( int i = 1; i < M-1; i++ )
            {
                A[j][i] = Anew[j][i];
            }
        }
        //TODO: Start bulk part asynchronously

        //Periodic boundary conditions
        int top    = (rank == 0) ? (size-1) : rank-1;
        int bottom = (rank == (size-1)) ? 0 : rank+1;

        #pragma acc host_data use_device( A )
        {
            //1. Send row jstart (first modified row) to top, receive lower boundary (jend) from bottom
            MPI_Sendrecv( A[jstart], M, MPI_FLOAT, top   , 0,
                          A[jend],   M, MPI_FLOAT, bottom, 0,
                          MPI_COMM_WORLD, MPI_STATUS_IGNORE );

            //2. Send row (jend-1) (last modified row) to bottom, receive upper boundary (jstart-1) from top
            MPI_Sendrecv( A[(jend-1)],   M, MPI_FLOAT, bottom, 0,
                          A[(jstart-1)], M, MPI_FLOAT, top   , 0,
                          MPI_COMM_WORLD, MPI_STATUS_IGNORE );
        }
        //TODO: wait for bulk part

        if(rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);

        iter++;
    }
    MPI_Barrier( MPI_COMM_WORLD );
    double runtime = GetTimer();

    if (check_results( rank, jstart, jend, tol ) && rank == 0)
    {
        printf( "Num GPUs: %d\n", size );
        printf( "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%%\n",
                N, M, runtime_serial/1000.f, size, runtime/1000.f,
                runtime_serial/runtime, runtime_serial/(size*runtime)*100 );
    }

    MPI_Finalize();
    return 0;
}
int main(int argc, char** argv)
{
    int iter_max = 1000;

    const float pi  = 2.0 * asinf(1.0f);
    const float tol = 1.0e-5f;

    int rank = 0;
    int size = 1;

    //TODO: Initialize MPI and determine rank and size
    //int MPI_Init(int *argc, char ***argv);
    //int MPI_Comm_rank(MPI_COMM_WORLD, int *rank);
    //int MPI_Comm_size(MPI_COMM_WORLD, int *size)

    memset(A,    0, N * M * sizeof(float));
    memset(Aref, 0, N * M * sizeof(float));

    // set boundary conditions
    for (int j = 0; j < N; j++)
    {
        float y0 = sinf( 2.0 * pi * j / (N-1));
        A[j][0]      = y0;
        A[j][M-1]    = y0;
        Aref[j][0]   = y0;
        Aref[j][M-1] = y0;
    }

#if _OPENACC
    int ngpus = acc_get_num_devices(acc_device_nvidia);
    //TODO: choose device to use by this rank
    int devicenum = 0;
    acc_set_device_num(devicenum, acc_device_nvidia);
    // Call acc_init after acc_set_device_num to avoid multiple contexts on device 0 in multi GPU systems
    acc_init(acc_device_nvidia);
#endif /*_OPENACC*/

    int jstart = 1;
    int jend   = N-1;

    if ( rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", N, M);

    if ( rank == 0) printf("Calculate reference solution and time serial execution.\n");
    StartTimer();
    laplace2d_serial( rank, iter_max, tol );
    double runtime_serial = GetTimer();

    //TODO: Wait for all processes to ensure correct timing of the parallel version
    //int MPI_Barrier( MPI_COMM_WORLD );
    if ( rank == 0) printf("Parallel execution.\n");
    StartTimer();
    int iter    = 0;
    float error = 1.0f;

    #pragma acc data copy(A) create(Anew)
    while ( error > tol && iter < iter_max )
    {
        error = 0.f;

        #pragma acc kernels
        for (int j = jstart; j < jend; j++)
        {
            for( int i = 1; i < M-1; i++ )
            {
                Anew[j][i] = 0.25f * ( A[j][i+1] + A[j][i-1]
                                     + A[j-1][i] + A[j+1][i]);
                error = fmaxf( error, fabsf(Anew[j][i]-A[j][i]));
            }
        }

        #pragma acc kernels
        for (int j = jstart; j < jend; j++)
        {
            for( int i = 1; i < M-1; i++ )
            {
                A[j][i] = Anew[j][i];
            }
        }

        //Periodic boundary conditions
        #pragma acc kernels
        for( int i = 1; i < M-1; i++ )
        {
            A[0][i]     = A[(N-2)][i];
            A[(N-1)][i] = A[1][i];
        }

        if(rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);

        iter++;
    }
    //TODO: Wait for all processes to ensure correct timing of the parallel version
    //int MPI_Barrier( MPI_COMM_WORLD );
    double runtime = GetTimer();

    if (check_results( rank, jstart, jend, tol ) && rank == 0)
    {
        printf( "Num GPUs: %d\n", size );
        printf( "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%%\n",
                N, M, runtime_serial/1000.f, size, runtime/1000.f,
                runtime_serial/runtime, runtime_serial/(size*runtime)*100 );
    }

    //TODO: Finalize MPI
    //int MPI_Finalize();
    return 0;
}
int metrosim::run(int argc, char** argv) {
  SimulationArgs args = SimulationArgs();
  if (!getCommands(argc, argv, &args)) {
    exit(EXIT_FAILURE);
  }

#ifdef _OPENACC
  int numDevices = acc_get_num_devices(acc_device_nvidia);

  // Ensure args.simulationMode is either Serial or Parallel
  if (args.simulationMode == SimulationMode::Default) {
    if (numDevices < 1) {
      fprintf(stdout, "No GPU devices found; defaulting to CPU "
              "execution.\n");
      args.simulationMode = SimulationMode::Serial;
    } else {
      fprintf(stdout, "%d GPU device(s) found; running on GPU.\n",
              numDevices);
      // FIXME (blm)
      fprintf(stdout, "WARNING: Not all features support GPU offloading at "
              "this time. Results may be inaccurate.\n");
      args.simulationMode = SimulationMode::Parallel;
    }
  }

  if (args.simulationMode == SimulationMode::Parallel) {
    if (numDevices == 0) {
      fprintf(stdout, "ERROR: Cannot find suitable GPU!\n");
      exit(EXIT_FAILURE);
    }
    // Implicitly sets the device
    acc_init(acc_device_nvidia);
  }
#else
  // Without OpenACC, only serial calculations are supported
  if (args.simulationMode == SimulationMode::Parallel) {
    fprintf(stdout, "Must compile with OpenACC capability to run in "
            "parallel mode.\n");
    exit(EXIT_FAILURE);
  } else {
    args.simulationMode = SimulationMode::Serial;
    fprintf(stdout, "Beginning simulation using CPU...\n");
  }
#endif

  Simulation sim = Simulation(args);
  sim.run();
  fprintf(stdout, "Finishing simulation...\n\n");

  // Shutdown the device if we were using the GPU
#ifdef _OPENACC
  if (args.simulationMode == SimulationMode::Parallel) {
    acc_shutdown(acc_device_nvidia);
  }
#endif

  exit(EXIT_SUCCESS);
}
int main (int argc, char **argv)
{
  const int nthreads = 1;
  int i;
  pthread_attr_t attr;
  pthread_t *tid;

  if (acc_get_num_devices (acc_device_nvidia) == 0)
    return 0;

  acc_init (acc_device_nvidia);

  x = (unsigned char *) malloc (N);
  for (i = 0; i < N; i++)
    {
      x[i] = i;
    }

  d_x = acc_copyin (x, N);

  if (acc_is_present (x, N) != 1)
    abort ();

  if (pthread_attr_init (&attr) != 0)
    perror ("pthread_attr_init failed");

  tid = (pthread_t *) malloc (nthreads * sizeof (pthread_t));

  for (i = 0; i < nthreads; i++)
    {
      if (pthread_create (&tid[i], &attr, &test, (void *) (unsigned long) (i)) != 0)
        perror ("pthread_create failed");
    }

  if (pthread_attr_destroy (&attr) != 0)
    perror ("pthread_attr_destroy failed");

  for (i = 0; i < nthreads; i++)
    {
      void *res;

      if (pthread_join (tid[i], &res) != 0)
        perror ("pthread join failed");
    }

  if (acc_is_present (x, N) != 1)
    abort ();

  memset (x, 0, N);

  acc_copyout (x, N);

  for (i = 0; i < N; i++)
    {
      if (x[i] != N - i - 1)
        abort ();
    }

  if (acc_is_present (x, N) != 0)
    abort ();

  acc_shutdown (acc_device_nvidia);

  return 0;
}
void ops_init_backend() {
  acc_set_device_num(ops_get_proc() % acc_get_num_devices(acc_device_nvidia),
                     acc_device_nvidia);
  ops_device_initialised_externally = 1;
}
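The call above maps processes to GPUs round-robin. A stand-alone sketch of the same pattern, not from the original source, assuming local_rank holds the process index supplied by the host runtime (ops_get_proc() here, an MPI local rank elsewhere):

/* Round-robin rank-to-GPU mapping sketch; local_rank is an assumed input. */
int ndev = acc_get_num_devices(acc_device_nvidia);
if (ndev > 0)
    acc_set_device_num(local_rank % ndev, acc_device_nvidia);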
int main(int argc, char** argv)
{
    int iter_max = 1000;
    const real tol = 1.0e-5;

    int rank = 0;
    int size = 1;

    //Initialize MPI and determine rank and size
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if ( size > MAX_MPI_SIZE )
    {
        if ( 0 == rank )
        {
            fprintf(stderr, "ERROR: Only up to %d MPI ranks are supported.\n", MAX_MPI_SIZE);
        }
        return -1;
    }

    dim2 size2d = size_to_2Dsize(size);
    int sizex = size2d.x;
    int sizey = size2d.y;
    assert(sizex*sizey == size);

    int rankx = rank%sizex;
    int ranky = rank/sizex;

    memset(A,    0, NY * NX * sizeof(real));
    memset(Aref, 0, NY * NX * sizeof(real));

    // set rhs
    for (int iy = 1; iy < NY-1; iy++)
    {
        for( int ix = 1; ix < NX-1; ix++ )
        {
            const real x = -1.0 + (2.0*ix/(NX-1));
            const real y = -1.0 + (2.0*iy/(NY-1));
            rhs[iy][ix] = expr(-10.0*(x*x + y*y));
        }
    }

#if _OPENACC
    acc_device_t device_type = acc_get_device_type();
    if ( acc_device_nvidia == device_type )
    {
        int ngpus = acc_get_num_devices(acc_device_nvidia);
        int devicenum = rank%ngpus;
        acc_set_device_num(devicenum, acc_device_nvidia);
    }
    // Call acc_init after acc_set_device_num to avoid multiple contexts on device 0 in multi GPU systems
    acc_init(device_type);
#endif /*_OPENACC*/

    // Ensure correctness if NX%sizex != 0
    int chunk_sizex = ceil( (1.0*NX)/sizex );

    int ix_start = rankx * chunk_sizex;
    int ix_end   = ix_start + chunk_sizex;

    // Do not process boundaries
    ix_start = max( ix_start, 1 );
    ix_end   = min( ix_end, NX - 1 );

    // Ensure correctness if NY%sizey != 0
    int chunk_sizey = ceil( (1.0*NY)/sizey );

    int iy_start = ranky * chunk_sizey;
    int iy_end   = iy_start + chunk_sizey;

    // Do not process boundaries
    iy_start = max( iy_start, 1 );
    iy_end   = min( iy_end, NY - 1 );

    if ( rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", NY, NX);

    if ( rank == 0) printf("Calculate reference solution and time serial execution.\n");
    StartTimer();
    poisson2d_serial( rank, iter_max, tol );
    double runtime_serial = GetTimer();

    //Wait for all processes to ensure correct timing of the parallel version
    MPI_Barrier( MPI_COMM_WORLD );
    if ( rank == 0) printf("Parallel execution.\n");
    StartTimer();
    int iter   = 0;
    real error = 1.0;

    #pragma acc data copy(A) copyin(rhs) create(Anew,to_left,from_left,to_right,from_right)
    while ( error > tol && iter < iter_max )
    {
        error = 0.0;

        #pragma acc kernels
        for (int iy = iy_start; iy < iy_end; iy++)
        {
            for( int ix = ix_start; ix < ix_end; ix++ )
            {
                Anew[iy][ix] = -0.25 * (rhs[iy][ix] - ( A[iy][ix+1] + A[iy][ix-1]
                                                      + A[iy-1][ix] + A[iy+1][ix] ));
                error = fmaxr( error, fabsr(Anew[iy][ix]-A[iy][ix]));
            }
        }

        real globalerror = 0.0;
        MPI_Allreduce( &error, &globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD );
        error = globalerror;

        #pragma acc kernels
        for (int iy = iy_start; iy < iy_end; iy++)
        {
            for( int ix = ix_start; ix < ix_end; ix++ )
            {
                A[iy][ix] = Anew[iy][ix];
            }
        }

        //Periodic boundary conditions
        int topy    = (ranky == 0) ? (sizey-1) : ranky-1;
        int bottomy = (ranky == (sizey-1)) ? 0 : ranky+1;
        int top    = topy    * sizex + rankx;
        int bottom = bottomy * sizex + rankx;

        #pragma acc host_data use_device( A )
        {
            //1. Send row iy_start (first modified row) to top, receive lower boundary (iy_end) from bottom
            MPI_Sendrecv( &A[iy_start][ix_start], (ix_end-ix_start), MPI_REAL_TYPE, top   , 0,
                          &A[iy_end][ix_start],   (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0,
                          MPI_COMM_WORLD, MPI_STATUS_IGNORE );

            //2. Send row (iy_end-1) (last modified row) to bottom, receive upper boundary (iy_start-1) from top
            MPI_Sendrecv( &A[(iy_end-1)][ix_start],   (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0,
                          &A[(iy_start-1)][ix_start], (ix_end-ix_start), MPI_REAL_TYPE, top   , 0,
                          MPI_COMM_WORLD, MPI_STATUS_IGNORE );
        }

        int leftx  = (rankx == 0) ? (sizex-1) : rankx-1;
        int rightx = (rankx == (sizex-1)) ? 0 : rankx+1;
        int left  = ranky * sizex + leftx;
        int right = ranky * sizex + rightx;

        #pragma acc kernels
        for( int iy = iy_start; iy < iy_end; iy++ )
        {
            to_left[iy]  = A[iy][ix_start];
            to_right[iy] = A[iy][ix_end-1];
        }

        #pragma acc host_data use_device( to_left, from_left, to_right, from_right )
        {
            //1. Send to_left (rows iy_start to iy_end-1) to the left neighbor and receive the same rows into from_right from the right neighbor
            MPI_Sendrecv( to_left+iy_start,    (iy_end-iy_start), MPI_REAL_TYPE, left , 0,
                          from_right+iy_start, (iy_end-iy_start), MPI_REAL_TYPE, right, 0,
                          MPI_COMM_WORLD, MPI_STATUS_IGNORE );

            //2. Send to_right (rows iy_start to iy_end-1) to the right neighbor and receive the same rows into from_left from the left neighbor
            MPI_Sendrecv( to_right+iy_start,  (iy_end-iy_start), MPI_REAL_TYPE, right, 0,
                          from_left+iy_start, (iy_end-iy_start), MPI_REAL_TYPE, left , 0,
                          MPI_COMM_WORLD, MPI_STATUS_IGNORE );
        }

        #pragma acc kernels
        for( int iy = iy_start; iy < iy_end; iy++ )
        {
            A[iy][ix_start-1] = from_left[iy];
            A[iy][ix_end]     = from_right[iy];
        }

        if(rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);

        iter++;
    }
    MPI_Barrier( MPI_COMM_WORLD );
    double runtime = GetTimer();

    if (check_results( rank, ix_start, ix_end, iy_start, iy_end, tol ) && rank == 0)
    {
        printf( "Num GPUs: %d with a (%d,%d) layout.\n", size, sizey, sizex );
        printf( "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%%\n",
                NY, NX, runtime_serial/1000.0, size, runtime/1000.0,
                runtime_serial/runtime, runtime_serial/(size*runtime)*100 );
    }

    MPI_Finalize();
    return 0;
}