C++ (Cpp) launch_kernelの例

コード例 #1

0

ファイルを表示

ファイル: kickstart.c プロジェクト: hro424/arcos

/**
 * Main kickstart loader function.  Parses through all loader formats
 * to find a valid one.
 */
int
main(void)
{
    loader_format_t * fmt = NULL;
    L4_Word_t entry;
    L4_Word_t n;
    
    /* Try to find a valid loader format. */
    for (n = 0; loader_formats[n].probe; n++) {
	    if (loader_formats[n].probe ()) {
		    fmt = &loader_formats[n];
		    break;
	    }
    }

    if (fmt == NULL) {
	    printf ("No valid loader format found.");
	    return 0;
    }

    printf ("Detected %s\n", fmt->name);
    entry = fmt->init();

    /* Flush caches (some archs don't like code in their D-cache) */
    flush_cache();

    printf("Launching kernel ...\n");

    /* Start the kernel at its entry point */
    launch_kernel (entry);
    
    /* We're not supposed to return from the kernel. Signal if we do */
    FAIL();
    return 0;
}

コード例 #2

0

ファイルを表示

ファイル: cuda.c プロジェクト: anojavan/csinparallel

/* 
 * Advance the simulation by <n> generations by mapping the OpenGL pixel buffer
 * objects for writing from CUDA, executing the kernel <n> times, and unmapping
 * the pixel buffer object.
 */
void advance_generations(unsigned long n)
{
	uint8_t* device_bufs[2];
	size_t size;

	DEBUG2("Mapping CUDA resources and retrieving device buffer pointers\n");
	cudaGraphicsMapResources(2, cuda_graphics_resources, (cudaStream_t)0);

	cudaGraphicsResourceGetMappedPointer((void**)&device_bufs[0], &size, 
								cuda_graphics_resources[0]);

	cudaGraphicsResourceGetMappedPointer((void**)&device_bufs[1], &size, 
								cuda_graphics_resources[1]);

	check_cuda_error();

	while (n--) {

		DEBUG2("Launching kernel (grid.width = %u, grid.height = %u)\n",
				grid.width, grid.height);

		launch_kernel(device_bufs[grid.which_buf], device_bufs[!grid.which_buf], 
									grid.width, grid.height);

		grid.which_buf ^= 1;
	}

	DEBUG2("Unmapping CUDA resources\n");

	cudaGraphicsUnmapResources(2, cuda_graphics_resources, (cudaStream_t)0);
	cudaStreamSynchronize(0);
}

コード例 #3

0

ファイルを表示

ファイル: _SampleVisual.cpp プロジェクト: BclEx/GpuStructs

void RunCuda(struct cudaGraphicsResource **resource)
{
    // map OpenGL buffer object for writing from CUDA
    checkCudaErrors(cudaGraphicsMapResources(1, resource, 0), exit(0));
    float4 *devPtr;
    size_t size;
    checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&devPtr, &size, *resource), exit(0));
    //printf("CUDA mapped VBO: May access %ld bytes\n", size);
    launch_kernel(devPtr, MeshWidth, MeshHeight, _anim);
    // unmap buffer object
    checkCudaErrors(cudaGraphicsUnmapResources(1, resource, 0), exit(0));
}

コード例 #4

0

ファイルを表示

ファイル: simplePBO.cpp プロジェクト: armerpaul/CUDA_RayTracer

// Run the Cuda part of the computation
void runCuda()
{
  uchar4 *dptr=NULL;
 
  // map OpenGL buffer object for writing from CUDA on a single GPU
  // no data is moved (Win & Linux). When mapped to CUDA, OpenGL
  // should not use this buffer
  cudaGLMapBufferObject((void**)&dptr, pbo);
 
  // execute the kernel
  launch_kernel(dptr, image_width, image_height, animTime);
 
  // unmap buffer object
  cudaGLUnmapBufferObject(pbo);
}

コード例 #5

0

ファイルを表示

ファイル: simpleGL.cpp プロジェクト: AnkurAnandapu/ocelot-fork

////////////////////////////////////////////////////////////////////////////////
//! Run the Cuda part of the computation
////////////////////////////////////////////////////////////////////////////////
void runCuda(GLuint vbo)
{
    // map OpenGL buffer object for writing from CUDA
    float4 *dptr;
    cutilSafeCall(cudaGLMapBufferObject((void**)&dptr, vbo));

    // execute the kernel
//    dim3 block(8, 8, 1);
//    dim3 grid(mesh_width / block.x, mesh_height / block.y, 1);
//    kernel<<< grid, block>>>(dptr, mesh_width, mesh_height, anim);

    launch_kernel(dptr, mesh_width, mesh_height, anim);

    // unmap buffer object
    cutilSafeCall(cudaGLUnmapBufferObject(vbo));
}

コード例 #6

0

ファイルを表示

ファイル: launch_kernel.hpp プロジェクト: NextGenIntelligence/agency

__host__ __device__
void checked_launch_kernel(void* kernel, ::dim3 grid_dim, ::dim3 block_dim, int shared_memory_size, cudaStream_t stream, const Args&... args)
{
  // the error message we return depends on how the program was compiled
  const char* error_message = 
#if __cuda_lib_has_cudart
   // we have access to CUDART, so something went wrong during the kernel
#  ifndef __CUDA_ARCH__
   "cuda::detail::checked_launch_kernel(): CUDA error after cudaLaunch()"
#  else
   "cuda::detail::checked_launch_kernel(): CUDA error after cudaLaunchDevice()"
#  endif // __CUDA_ARCH__
#else // __cuda_lib_has_cudart
   // we don't have access to CUDART, so output a useful error message explaining why it's unsupported
#  ifndef __CUDA_ARCH__
   "cuda::detail::checked_launch_kernel(): CUDA kernel launch from host requires nvcc"
#  else
   "cuda::detail::checked_launch_kernel(): CUDA kernel launch from device requires arch=sm_35 or better and rdc=true"
#  endif // __CUDA_ARCH__
#endif
  ;

  throw_on_error(launch_kernel(kernel, grid_dim, block_dim, shared_memory_size, stream, args...), error_message);
}

コード例 #7

0

ファイルを表示

ファイル: runtime.cpp プロジェクト: AnyDSL/thorin

Continuation* Runtime::emit_host_code(CodeGen& code_gen, Platform platform, const std::string& ext, Continuation* continuation) {
    // to-target is the desired kernel call
    // target(mem, device, (dim.x, dim.y, dim.z), (block.x, block.y, block.z), body, return, free_vars)
    auto target = continuation->callee()->as_continuation();
    assert_unused(target->is_intrinsic());
    assert(continuation->num_args() >= LaunchArgs::Num && "required arguments are missing");

    // arguments
    auto target_device_id = code_gen.lookup(continuation->arg(LaunchArgs::Device));
    auto target_platform = builder_.getInt32(platform);
    auto target_device = builder_.CreateOr(target_platform, builder_.CreateShl(target_device_id, builder_.getInt32(4)));
    auto it_space = continuation->arg(LaunchArgs::Space)->as<Tuple>();
    auto it_config = continuation->arg(LaunchArgs::Config)->as<Tuple>();
    auto kernel = continuation->arg(LaunchArgs::Body)->as<Global>()->init()->as<Continuation>();

    auto kernel_name = builder_.CreateGlobalStringPtr(kernel->name().str());
    auto file_name = builder_.CreateGlobalStringPtr(continuation->world().name() + ext);
    const size_t num_kernel_args = continuation->num_args() - LaunchArgs::Num;

    // allocate argument pointers, sizes, and types
    llvm::Value* args   = code_gen.emit_alloca(llvm::ArrayType::get(builder_.getInt8PtrTy(), num_kernel_args), "args");
    llvm::Value* sizes  = code_gen.emit_alloca(llvm::ArrayType::get(builder_.getInt32Ty(),   num_kernel_args), "sizes");
    llvm::Value* aligns = code_gen.emit_alloca(llvm::ArrayType::get(builder_.getInt32Ty(),   num_kernel_args), "aligns");
    llvm::Value* types  = code_gen.emit_alloca(llvm::ArrayType::get(builder_.getInt8Ty(),    num_kernel_args), "types");

    // fill array of arguments
    for (size_t i = 0; i < num_kernel_args; ++i) {
        auto target_arg = continuation->arg(i + LaunchArgs::Num);
        const auto target_val = code_gen.lookup(target_arg);

        KernelArgType arg_type;
        llvm::Value*  void_ptr;
        if (target_arg->type()->isa<DefiniteArrayType>() ||
            target_arg->type()->isa<StructType>() ||
            target_arg->type()->isa<TupleType>()) {
            // definite array | struct | tuple
            auto alloca = code_gen.emit_alloca(target_val->getType(), target_arg->name().str());
            builder_.CreateStore(target_val, alloca);

            // check if argument type contains pointers
            if (!contains_ptrtype(target_arg->type()))
                WDEF(target_arg, "argument '{}' of aggregate type '{}' contains pointer (not supported in OpenCL 1.2)", target_arg, target_arg->type());

            void_ptr = builder_.CreatePointerCast(alloca, builder_.getInt8PtrTy());
            arg_type = KernelArgType::Struct;
        } else if (target_arg->type()->isa<PtrType>()) {
            auto ptr = target_arg->type()->as<PtrType>();
            auto rtype = ptr->pointee();

            if (!rtype->isa<ArrayType>())
                EDEF(target_arg, "currently only pointers to arrays supported as kernel argument; argument has different type: {}", ptr);

            auto alloca = code_gen.emit_alloca(builder_.getInt8PtrTy(), target_arg->name().str());
            auto target_ptr = builder_.CreatePointerCast(target_val, builder_.getInt8PtrTy());
            builder_.CreateStore(target_ptr, alloca);
            void_ptr = builder_.CreatePointerCast(alloca, builder_.getInt8PtrTy());
            arg_type = KernelArgType::Ptr;
        } else {
            // normal variable
            auto alloca = code_gen.emit_alloca(target_val->getType(), target_arg->name().str());
            builder_.CreateStore(target_val, alloca);

            void_ptr = builder_.CreatePointerCast(alloca, builder_.getInt8PtrTy());
            arg_type = KernelArgType::Val;
        }

        auto arg_ptr   = builder_.CreateInBoundsGEP(args,   llvm::ArrayRef<llvm::Value*>{builder_.getInt32(0), builder_.getInt32(i)});
        auto size_ptr  = builder_.CreateInBoundsGEP(sizes,  llvm::ArrayRef<llvm::Value*>{builder_.getInt32(0), builder_.getInt32(i)});
        auto align_ptr = builder_.CreateInBoundsGEP(aligns, llvm::ArrayRef<llvm::Value*>{builder_.getInt32(0), builder_.getInt32(i)});
        auto type_ptr  = builder_.CreateInBoundsGEP(types,  llvm::ArrayRef<llvm::Value*>{builder_.getInt32(0), builder_.getInt32(i)});

        auto size = layout_.getTypeStoreSize(target_val->getType());
        if (auto struct_type = llvm::dyn_cast<llvm::StructType>(target_val->getType())) {
            // In the case of a structure, do not include the padding at the end in the size
            auto last_elem   = struct_type->getStructNumElements() - 1;
            auto last_offset = layout_.getStructLayout(struct_type)->getElementOffset(last_elem);
            size = last_offset + layout_.getTypeStoreSize(struct_type->getStructElementType(last_elem));
        }

        builder_.CreateStore(void_ptr, arg_ptr);
        builder_.CreateStore(builder_.getInt32(size), size_ptr);
        builder_.CreateStore(builder_.getInt32(layout_.getABITypeAlignment(target_val->getType())), align_ptr);
        builder_.CreateStore(builder_.getInt8((uint8_t)arg_type), type_ptr);
    }

    // allocate arrays for the grid and block size
    const auto get_u32 = [&](const Def* def) { return builder_.CreateSExt(code_gen.lookup(def), builder_.getInt32Ty()); };

    llvm::Value* grid_array  = llvm::UndefValue::get(llvm::ArrayType::get(builder_.getInt32Ty(), 3));
    grid_array = builder_.CreateInsertValue(grid_array, get_u32(it_space->op(0)), 0);
    grid_array = builder_.CreateInsertValue(grid_array, get_u32(it_space->op(1)), 1);
    grid_array = builder_.CreateInsertValue(grid_array, get_u32(it_space->op(2)), 2);
    llvm::Value* grid_size = code_gen.emit_alloca(grid_array->getType(), "");
    builder_.CreateStore(grid_array, grid_size);

    llvm::Value* block_array = llvm::UndefValue::get(llvm::ArrayType::get(builder_.getInt32Ty(), 3));
    block_array = builder_.CreateInsertValue(block_array, get_u32(it_config->op(0)), 0);
    block_array = builder_.CreateInsertValue(block_array, get_u32(it_config->op(1)), 1);
    block_array = builder_.CreateInsertValue(block_array, get_u32(it_config->op(2)), 2);
    llvm::Value* block_size = code_gen.emit_alloca(block_array->getType(), "");
    builder_.CreateStore(block_array, block_size);

    std::vector<llvm::Value*> gep_first_elem{builder_.getInt32(0), builder_.getInt32(0)};
    grid_size  = builder_.CreateInBoundsGEP(grid_size,  gep_first_elem);
    block_size = builder_.CreateInBoundsGEP(block_size, gep_first_elem);
    args       = builder_.CreateInBoundsGEP(args,       gep_first_elem);
    sizes      = builder_.CreateInBoundsGEP(sizes,      gep_first_elem);
    aligns     = builder_.CreateInBoundsGEP(aligns,     gep_first_elem);
    types      = builder_.CreateInBoundsGEP(types,      gep_first_elem);

    launch_kernel(target_device,
                  file_name, kernel_name,
                  grid_size, block_size,
                  args, sizes, aligns, types,
                  builder_.getInt32(num_kernel_args));

    return continuation->arg(LaunchArgs::Return)->as_continuation();
}

コード例 #8

0

ファイルを表示

ファイル: oclGaussCrack.c プロジェクト: jsteube/oclGaussCrack

int main (int argc, char *argv[])
{
  uint64_t skip =  0;
  uint64_t left = -1;

  if (argc >= 2) skip = atoll (argv[1]);
  if (argc >= 3) left = atoll (argv[2]);

  printf ("Loading Kernel...\n");

  const char *filename = KERNEL_SRC;

  struct stat s;

  if (stat (filename, &s) == -1)
  {
    fprintf (stderr, "%s: %s in line %d\n", filename, strerror (errno), __LINE__);

    return (-1);
  }

  FILE *fp = fopen (filename, "rb");

  if (fp == NULL)
  {
    fprintf (stderr, "%s: %s in line %d\n", filename, strerror (errno), __LINE__);

    return (-1);
  }

  char *source_buf = (char *) malloc (s.st_size + 1);

  if (!fread (source_buf, sizeof (char), s.st_size, fp))
  {
    fprintf (stderr, "%s: %s in line %d\n", filename, strerror (errno), __LINE__);

    return (-1);
  }

  source_buf[s.st_size] = 0;

  fclose (fp);

  const char *sourceBuf[] = { source_buf };

  const size_t sourceLen[] = { s.st_size + 1 };

  printf ("Initializing OpenCL...\n");

  cl_platform_id platform;

  cl_uint num_devices = 0;

  cl_device_id devices[MAX_PLATFORM];

  gc_clGetPlatformIDs (1, &platform, NULL);

  gc_clGetDeviceIDs (platform, DEV_TYPE, MAX_PLATFORM, devices, &num_devices);

  gpu_ctx_t gpu_ctxs[MAX_GPU];

  memset (gpu_ctxs, 0, sizeof (gpu_ctxs));

  for (cl_uint device_id = 0; device_id < num_devices; device_id++)
  {
    cl_device_id device = devices[device_id];

    cl_context context = gc_clCreateContext (NULL, 1, &device, NULL, NULL);

    cl_program program = gc_clCreateProgramWithSource (context, 1, sourceBuf, sourceLen);

    gc_clBuildProgram (program, 1, &device, BUILD_OPTS, NULL, NULL);

    cl_kernel kernel = gc_clCreateKernel (program, KERNEL_NAME);

    cl_command_queue command_queue = gc_clCreateCommandQueue (context, device, 0);

    cl_uint max_compute_units;

    gc_clGetDeviceInfo (device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof (max_compute_units), &max_compute_units, NULL);

    char device_name[BUFSIZ];

    memset (device_name, 0, sizeof (device_name));

    gc_clGetDeviceInfo (device, CL_DEVICE_NAME, sizeof (device_name), &device_name, NULL);

    printf ("Found new device #%2d: %s, %u compute units\n", device_id, device_name, max_compute_units);

    const int num_threads  = GPU_THREADS;
    const int num_elements = max_compute_units * num_threads * GPU_ACCEL;

    /**
     * GPU memory
     */

    const size_t size_block   = num_elements * sizeof (block_t);
    const size_t size_results = num_threads  * sizeof (uint32_t);

    cl_mem d_block = gc_clCreateBuffer (context, CL_MEM_READ_ONLY, size_block, NULL);

    cl_mem d_results = gc_clCreateBuffer (context, CL_MEM_WRITE_ONLY, size_results, NULL);

    gc_clSetKernelArg (kernel, 0, sizeof (cl_mem), (void *) &d_block);
    gc_clSetKernelArg (kernel, 1, sizeof (cl_mem), (void *) &d_results);

    /**
     * Host memory
     */

    block_t *h_block = (block_t *) malloc (size_block);

    uint32_t *h_results = (uint32_t *) malloc (size_results);

    memset (h_results, 0xff, size_results);

    gc_clEnqueueWriteBuffer (command_queue, d_results, CL_TRUE, 0, size_results, h_results, 0, NULL, NULL);

    /**
     * Buffers for candidates
     */

    uint8_t **plains_buf = (uint8_t **) calloc (num_elements * VECT_SIZE, sizeof (uint8_t *));

    for (int i = 0; i < num_elements * VECT_SIZE; i++)
    {
      /* Agreed, this is not nice. But who cares nowadays? */

      plains_buf[i] = (uint8_t *) malloc (MAX_LINELEN);
    }

    size_t *plains_len = (size_t *) calloc (num_elements * VECT_SIZE, sizeof (size_t));

    gpu_ctx_t *gpu_ctx = &gpu_ctxs[device_id];

    gpu_ctx->context           = context;
    gpu_ctx->program           = program;
    gpu_ctx->kernel            = kernel;
    gpu_ctx->command_queue     = command_queue;
    gpu_ctx->max_compute_units = max_compute_units;
    gpu_ctx->d_block           = d_block;
    gpu_ctx->d_results         = d_results;
    gpu_ctx->h_block           = h_block;
    gpu_ctx->h_results         = h_results;
    gpu_ctx->num_threads       = num_threads;
    gpu_ctx->num_elements      = num_elements;
    gpu_ctx->plains_buf        = plains_buf;
    gpu_ctx->plains_len        = plains_len;
  }

  /* static salt */

  const uint8_t salt_buf[16] =
  {
    0x97, 0x48, 0x6C, 0xAA,
    0x22, 0x5F, 0xE8, 0x77,
    0xC0, 0x35, 0xCC, 0x03,
    0x73, 0x23, 0x6D, 0x51
  };

  const size_t salt_len = sizeof (salt_buf);

  /* main loop */

  printf ("Initialization done, accepting candidates from stdin...\n\n");

  cl_uint cur_device_id = 0;

  while (!feof (stdin))
  {
    /* Get new password candidate from stdin */

    uint8_t line_buf[MAX_LINELEN];

    int cur_c = 0;

    int prev_c = 0;

    size_t line_len = 0;

    for (size_t i = 0; i < MAX_LINELEN - 100; i++) // - 100 = we need some space for salt and padding
    {
      cur_c = getchar ();

      if (cur_c == EOF) break;

      if ((prev_c == '\n') && (cur_c == '\0'))
      {
        line_len--;

        break;
      }

      line_buf[line_len] = cur_c;

      line_len++;

      prev_c = cur_c;
    }

    /* chop \r if it exists for some reason (in case user used a dictionary) */

    if (line_len >= 2)
    {
      if ((prev_c == '\r') && (cur_c == '\0')) line_len -= 2;
    }

    /* skip empty lines */

    if (line_len == 0) continue;

    /* The following enables distributed computing / resume work */

    if (skip)
    {
      skip--;

      continue;
    }

    if (left)
    {
      left--;
    }
    else
    {
      break;
    }

    /* Append constant salt */

    memcpy (line_buf + line_len, salt_buf, salt_len);

    line_len += salt_len;

    /* Generate digest out of it */

    uint32_t digest[4];

    md5_transform ((uint32_t *) line_buf, (uint32_t) line_len, digest);

    /* Next garanteed free GPU */

    gpu_ctx_t *gpu_ctx = &gpu_ctxs[cur_device_id];

    /* Save original buffer in case it cracks it */

    memcpy (gpu_ctx->plains_buf[gpu_ctx->num_cached], line_buf, line_len - salt_len);

    gpu_ctx->plains_len[gpu_ctx->num_cached] = line_len - salt_len;

    /* Next garanteed free memory element on that GPU */

    const uint32_t element_div = gpu_ctx->num_cached / 4;
    const uint32_t element_mod = gpu_ctx->num_cached % 4;

    /* Copy new digest */

    gpu_ctx->h_block[element_div].A[element_mod] = digest[0];
    gpu_ctx->h_block[element_div].B[element_mod] = digest[1];
    gpu_ctx->h_block[element_div].C[element_mod] = digest[2];
    gpu_ctx->h_block[element_div].D[element_mod] = digest[3];

    gpu_ctx->num_cached++;

    /* If memory elements on that GPU are full, switch to the next GPU */

    if ((gpu_ctx->num_cached / VECT_SIZE) < gpu_ctx->num_elements) continue;

    cur_device_id++;

    /* If there is no more GPU left, run the calculation */

    if (cur_device_id < num_devices) continue;

    /* Fire! */

    calc_work (num_devices, gpu_ctxs);

    launch_kernel (num_devices, gpu_ctxs);

    /* Collecting data has a blocking effect */

    check_results (num_devices, gpu_ctxs);

    /* Reset buffer state */

    for (cl_uint device_id = 0; device_id < num_devices; device_id++)
    {
      gpu_ctx_t *gpu_ctx = &gpu_ctxs[device_id];

      gpu_ctx->num_cached = 0;
    }

    cur_device_id = 0;
  }

  /* Final calculation of leftovers */

  calc_work (num_devices, gpu_ctxs);

  launch_kernel (num_devices, gpu_ctxs);

  check_results (num_devices, gpu_ctxs);

  return -1;
}

コード例 #9

0

ファイルを表示

ファイル: mpiStencilCudaGL.cpp プロジェクト: vhpvmx/crowds

void compute_process()
{
	int np, pid;
	MPI_Comm_rank(MPI_COMM_WORLD, &pid);
	MPI_Comm_size(MPI_COMM_WORLD, &np);
	int server_process = np - 1;
	MPI_Status status;

	int num_comp_nodes = np -1;
	unsigned int num_bytes = sizeof(sAgents);
	unsigned int num_halo_points = RADIO * world_width;
	unsigned int num_halo_bytes = num_halo_points * sizeof(int);
 
	size_t size_world = world_width * world_height * sizeof(int);
	int *h_world = (int *)malloc(size_world);
	int *d_world;

	int left_neighbor = (pid > 0) ? (pid - 1) : MPI_PROC_NULL;
	int right_neighbor = (pid < np -2) ? (pid + 1) : MPI_PROC_NULL;


	for(int j = 0; j < world_width * world_height; j++)
	{	
		h_world[j] = 0;
	}


	sAgents h_agents_in, h_agents_left_node, h_agents_right_node;
	float4 h_agents_pos[agents_total], h_agents_ids[agents_total];
	float4 *d_agents_pos, *d_agents_ids;
	unsigned int num_bytes_agents = agents_total * sizeof(float4);

	int world_height_node = world_height / num_comp_nodes;

	// Error code to check return values for CUDA calls
        cudaError_t err = cudaSuccess;

	// Allocate the device pointer
    	err = cudaMalloc((void **)&d_world, size_world);

	if (err != cudaSuccess)
	{
        	fprintf(stderr, "Failed to allocate device pointer (error code %s)!\n", cudaGetErrorString(err));
        	exit(EXIT_FAILURE);
        }

    	err = cudaMalloc((void **)&d_agents_pos, num_bytes_agents);

	if (err != cudaSuccess)
	{
        	fprintf(stderr, "Failed to allocate device pointer (error code %s)!\n", cudaGetErrorString(err));
        	exit(EXIT_FAILURE);
        }

    	err = cudaMalloc((void **)&d_agents_ids, num_bytes_agents);

	if (err != cudaSuccess)
	{
        	fprintf(stderr, "Failed to allocate device pointer (error code %s)!\n", cudaGetErrorString(err));
        	exit(EXIT_FAILURE);
        }
		
	MPI_Recv(&h_agents_in, num_bytes, MPI_BYTE, server_process, 0, MPI_COMM_WORLD, &status);

	for(int i = 0; i < agents_total; i++)
	{
		//identify the active agents according to the y coordinate and set the busy cells in the world
		if(  ( round(h_agents_in.pos[i].y) >= (pid * world_height_node) ) and ( round(h_agents_in.pos[i].y) < ( (pid + 1) * world_height_node ) )  )
		{	
			h_agents_in.ids[i].y = 1;
			h_world[(int)round( (world_width * (h_agents_in.pos[i].y - 1) ) + h_agents_in.pos[i].x )] = h_agents_in.ids[i].x;
		}
		//Copy the data to a local arrays
		h_agents_pos[i] = h_agents_in.pos[i];
		h_agents_ids[i] = h_agents_in.ids[i];
	}


	err = cudaMemcpy(d_world, h_world, size_world, cudaMemcpyHostToDevice);

	if (err != cudaSuccess)
    	{
        	fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err));
        	exit(EXIT_FAILURE);
    	}


	//for(int it = 0; it < nreps ; it++)
	while(1)
	{
		int it=4;
		err = cudaMemcpy(d_agents_pos, h_agents_pos, num_bytes_agents, cudaMemcpyHostToDevice);

		if (err != cudaSuccess)
	    	{
	        	fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err));
	        	exit(EXIT_FAILURE);
	    	}
	
		err = cudaMemcpy(d_agents_ids, h_agents_ids, num_bytes_agents, cudaMemcpyHostToDevice);

		if (err != cudaSuccess)
	    	{
	        	fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err));
	        	exit(EXIT_FAILURE);
	    	}


		launch_kernel(d_agents_pos, d_agents_ids, d_world, world_width, world_height, agent_width, agent_height, world_height_node, pid );

		cudaMemcpy(h_agents_pos, d_agents_pos, num_bytes_agents, cudaMemcpyDeviceToHost);
		cudaMemcpy(h_agents_ids, d_agents_ids, num_bytes_agents, cudaMemcpyDeviceToHost);

		//copy the data to the struct
		for( int i = 0; i < agents_total; i++)
		{
			h_agents_in.pos[i] = h_agents_pos[i];
			h_agents_in.ids[i] = h_agents_ids[i];
		}


		MPI_Barrier(MPI_COMM_WORLD);
		MPI_Send(&h_agents_in, num_bytes, MPI_BYTE, server_process, DATA_COLLECT, MPI_COMM_WORLD);

		#ifdef DEBUG
		//printf("pid: %d\n", pid);
		//display_data(h_agents_in);
		#endif

		// send data to left, get data from right 
		MPI_Sendrecv(&h_agents_in, num_bytes, MPI_BYTE, left_neighbor, it, &h_agents_right_node, num_bytes, MPI_BYTE, right_neighbor, it, MPI_COMM_WORLD, &status);

		// send data to right, get data from left 
		MPI_Sendrecv(&h_agents_in, num_bytes, MPI_BYTE, right_neighbor, it, &h_agents_left_node, num_bytes, MPI_BYTE, left_neighbor, it, MPI_COMM_WORLD, &status);

		for( int i = 0; i < agents_total; i++)
		{
			if(pid != np-2)
			{
				if(h_agents_right_node.ids[i].y == 2)
				{
					h_agents_in.pos[i] = h_agents_right_node.pos[i];
					h_agents_pos[i] = h_agents_right_node.pos[i];
					h_agents_in.ids[i].y = 1;			
					h_agents_ids[i].y = 1;	
				}
			}
			if(pid != 0)				
			{
				if(h_agents_left_node.ids[i].y == 3)
				{
					h_agents_in.pos[i] = h_agents_left_node.pos[i];
					h_agents_pos[i] = h_agents_left_node.pos[i];
					h_agents_in.ids[i].y = 1;			
					h_agents_ids[i].y = 1;	
				}
			}
		}

/***
		if(pid == 1)
		{	
			printf("pid: %d\n", pid);
			display_data(h_agents_in);
			display_data(h_agents_right_node);
			display_data(h_agents_left_node);
		}
***/

	}


	/* Release resources */
//	free(h_agents_in); 
/*	
	free(h_output);
	cudaFreeHost(h_left_boundary); cudaFreeHost(h_right_boundary);
	cudaFreeHost(h_left_halo); cudaFreeHost(h_right_halo);
	cudaFree(d_input); cudaFree(d_output);
*/
}

コード例 #10

0

ファイルを表示

ファイル: mpiStencilCudaStructs.cpp プロジェクト: vhpvmx/crowds

void compute_process(int agents_total, int nreps, int world_width, int world_height)
{
	int np, pid;
	MPI_Comm_rank(MPI_COMM_WORLD, &pid);
	MPI_Comm_size(MPI_COMM_WORLD, &np);
	int server_process = np - 1;
	MPI_Status status;

	/* create a type for struct agent */
	const int nitems=5;
   	int blocklengths[5] = {1,1,1,1,1};
   	MPI_Datatype types[5] = {MPI_INT, MPI_INT, MPI_INT, MPI_FLOAT, MPI_FLOAT};
	MPI_Datatype mpi_agent_type;
	MPI_Aint offsets[5];

	offsets[0] = offsetof(agent, id);
    	offsets[1] = offsetof(agent, x);
    	offsets[2] = offsetof(agent, y);
    	offsets[3] = offsetof(agent, z);
    	offsets[4] = offsetof(agent, w);

	MPI_Type_create_struct(nitems, blocklengths, offsets, types, &mpi_agent_type);
	MPI_Type_commit(&mpi_agent_type);

	unsigned int num_bytes = agents_total * sizeof(float4);
	unsigned int num_halo_points = RADIO * world_width;
	unsigned int num_halo_bytes = num_halo_points * sizeof(short int);

	//unsigned int world_node_height = (world_height / (np-1)) + (RADIO * 2);
	//if(pid == 0 or pid == np - 2)
	//	world_node_height -= RADIO;
 
	size_t size_world = world_width * world_height * sizeof(short int);
	short int *h_world = (short int *)malloc(size_world);
	*h_world = 0;
	short int *d_world;

	for(int j = 0; j < world_width * world_height; j++)
	{	
		h_world[j] = 0;
	}

	/* alloc host memory */
	agent *h_agents_in = (agent *)malloc(num_bytes);
	//agent *d_agents_in;
	float4 *h_agents_pos;
	float4 *d_agents_pos;
	
	
	//MPI_Recv(rcv_address, num_points, MPI_FLOAT, server_process, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
	MPI_Recv(h_agents_in, agents_total, mpi_agent_type, server_process, 0, MPI_COMM_WORLD, &status);

	//Iniatialize world
	for( int i = 0; i < agents_total; i++)
	{
		h_world[(world_width * (h_agents_in[i].y - 1) ) + h_agents_in[i].x] = (h_agents_in[i].x!=0?1:0);
		//if(h_world[(world_width * (h_agents_in[i].y - 1) ) + h_agents_in[i].x] == 1)
			//printf("world x: %d, y: %d\n", h_agents_in[i].x, h_agents_in[i].y);	
		h_agents_pos[i].x = h_agents_in[i].x;
		h_agents_pos[i].y = h_agents_in[i].y;
		h_agents_pos[i].z = h_agents_in[i].z;
		h_agents_pos[i].w = h_agents_in[i].w;
	}

/***
	if(pid ==1)
{
	int k=0;
	for(int j = 0; j < world_width * world_height; j++)
	{	
		if ( j%96 == 0 and j>0)
		{
			k++;
			printf("%d row: %d\n", h_world[j], k);
		}
		else
			printf("%d ", h_world[j]);
	}
}
***/

	// Error code to check return values for CUDA calls
        cudaError_t err = cudaSuccess;

	// Allocate the device pointer
    	err = cudaMalloc((void **)&d_world, size_world);

	if (err != cudaSuccess)
	{
        	fprintf(stderr, "Failed to allocate device pointer (error code %s)!\n", cudaGetErrorString(err));
        	exit(EXIT_FAILURE);
        }

	err = cudaMemcpy(d_world, h_world, size_world, cudaMemcpyHostToDevice);

	if (err != cudaSuccess)
    	{
        	fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err));
        	exit(EXIT_FAILURE);
    	}


	//http://cuda-programming.blogspot.com.es/2013/02/cuda-array-in-cuda-how-to-use-cuda.html
	//http://stackoverflow.com/questions/17924705/structure-of-arrays-vs-array-of-structures-in-cuda
	// Allocate the device pointer

    	err = cudaMalloc((void **)&d_agents_pos, num_bytes);

	if (err != cudaSuccess)
	{
        	fprintf(stderr, "Failed to allocate device pointer (error code %s)!\n", cudaGetErrorString(err));
        	exit(EXIT_FAILURE);
        }

	err = cudaMemcpy(d_agents_pos, h_agents_pos, num_bytes, cudaMemcpyHostToDevice);

	if (err != cudaSuccess)
    	{
        	fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err));
        	exit(EXIT_FAILURE);
    	}


	launch_kernel(d_agents_pos, d_world, world_width, world_height );

	MPI_Barrier( MPI_COMM_WORLD);

#ifdef DEBUG
//	printf("pid: %d\n", pid);
//	display_data(h_agents_in, agents_total );
#endif

	MPI_Send(h_agents_in, agents_total, mpi_agent_type, server_process, DATA_COLLECT, MPI_COMM_WORLD);


	/* Release resources */
	free(h_agents_in); 
/*	
	free(h_output);
	cudaFreeHost(h_left_boundary); cudaFreeHost(h_right_boundary);
	cudaFreeHost(h_left_halo); cudaFreeHost(h_right_halo);
	cudaFree(d_input); cudaFree(d_output);
*/
}