/** * Main kickstart loader function. Parses through all loader formats * to find a valid one. */ int main(void) { loader_format_t * fmt = NULL; L4_Word_t entry; L4_Word_t n; /* Try to find a valid loader format. */ for (n = 0; loader_formats[n].probe; n++) { if (loader_formats[n].probe ()) { fmt = &loader_formats[n]; break; } } if (fmt == NULL) { printf ("No valid loader format found."); return 0; } printf ("Detected %s\n", fmt->name); entry = fmt->init(); /* Flush caches (some archs don't like code in their D-cache) */ flush_cache(); printf("Launching kernel ...\n"); /* Start the kernel at its entry point */ launch_kernel (entry); /* We're not supposed to return from the kernel. Signal if we do */ FAIL(); return 0; }
/* * Advance the simulation by <n> generations by mapping the OpenGL pixel buffer * objects for writing from CUDA, executing the kernel <n> times, and unmapping * the pixel buffer object. */ void advance_generations(unsigned long n) { uint8_t* device_bufs[2]; size_t size; DEBUG2("Mapping CUDA resources and retrieving device buffer pointers\n"); cudaGraphicsMapResources(2, cuda_graphics_resources, (cudaStream_t)0); cudaGraphicsResourceGetMappedPointer((void**)&device_bufs[0], &size, cuda_graphics_resources[0]); cudaGraphicsResourceGetMappedPointer((void**)&device_bufs[1], &size, cuda_graphics_resources[1]); check_cuda_error(); while (n--) { DEBUG2("Launching kernel (grid.width = %u, grid.height = %u)\n", grid.width, grid.height); launch_kernel(device_bufs[grid.which_buf], device_bufs[!grid.which_buf], grid.width, grid.height); grid.which_buf ^= 1; } DEBUG2("Unmapping CUDA resources\n"); cudaGraphicsUnmapResources(2, cuda_graphics_resources, (cudaStream_t)0); cudaStreamSynchronize(0); }
void RunCuda(struct cudaGraphicsResource **resource) { // map OpenGL buffer object for writing from CUDA checkCudaErrors(cudaGraphicsMapResources(1, resource, 0), exit(0)); float4 *devPtr; size_t size; checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&devPtr, &size, *resource), exit(0)); //printf("CUDA mapped VBO: May access %ld bytes\n", size); launch_kernel(devPtr, MeshWidth, MeshHeight, _anim); // unmap buffer object checkCudaErrors(cudaGraphicsUnmapResources(1, resource, 0), exit(0)); }
// Run the Cuda part of the computation void runCuda() { uchar4 *dptr=NULL; // map OpenGL buffer object for writing from CUDA on a single GPU // no data is moved (Win & Linux). When mapped to CUDA, OpenGL // should not use this buffer cudaGLMapBufferObject((void**)&dptr, pbo); // execute the kernel launch_kernel(dptr, image_width, image_height, animTime); // unmap buffer object cudaGLUnmapBufferObject(pbo); }
//////////////////////////////////////////////////////////////////////////////// //! Run the Cuda part of the computation //////////////////////////////////////////////////////////////////////////////// void runCuda(GLuint vbo) { // map OpenGL buffer object for writing from CUDA float4 *dptr; cutilSafeCall(cudaGLMapBufferObject((void**)&dptr, vbo)); // execute the kernel // dim3 block(8, 8, 1); // dim3 grid(mesh_width / block.x, mesh_height / block.y, 1); // kernel<<< grid, block>>>(dptr, mesh_width, mesh_height, anim); launch_kernel(dptr, mesh_width, mesh_height, anim); // unmap buffer object cutilSafeCall(cudaGLUnmapBufferObject(vbo)); }
__host__ __device__ void checked_launch_kernel(void* kernel, ::dim3 grid_dim, ::dim3 block_dim, int shared_memory_size, cudaStream_t stream, const Args&... args) { // the error message we return depends on how the program was compiled const char* error_message = #if __cuda_lib_has_cudart // we have access to CUDART, so something went wrong during the kernel # ifndef __CUDA_ARCH__ "cuda::detail::checked_launch_kernel(): CUDA error after cudaLaunch()" # else "cuda::detail::checked_launch_kernel(): CUDA error after cudaLaunchDevice()" # endif // __CUDA_ARCH__ #else // __cuda_lib_has_cudart // we don't have access to CUDART, so output a useful error message explaining why it's unsupported # ifndef __CUDA_ARCH__ "cuda::detail::checked_launch_kernel(): CUDA kernel launch from host requires nvcc" # else "cuda::detail::checked_launch_kernel(): CUDA kernel launch from device requires arch=sm_35 or better and rdc=true" # endif // __CUDA_ARCH__ #endif ; throw_on_error(launch_kernel(kernel, grid_dim, block_dim, shared_memory_size, stream, args...), error_message); }
Continuation* Runtime::emit_host_code(CodeGen& code_gen, Platform platform, const std::string& ext, Continuation* continuation) { // to-target is the desired kernel call // target(mem, device, (dim.x, dim.y, dim.z), (block.x, block.y, block.z), body, return, free_vars) auto target = continuation->callee()->as_continuation(); assert_unused(target->is_intrinsic()); assert(continuation->num_args() >= LaunchArgs::Num && "required arguments are missing"); // arguments auto target_device_id = code_gen.lookup(continuation->arg(LaunchArgs::Device)); auto target_platform = builder_.getInt32(platform); auto target_device = builder_.CreateOr(target_platform, builder_.CreateShl(target_device_id, builder_.getInt32(4))); auto it_space = continuation->arg(LaunchArgs::Space)->as<Tuple>(); auto it_config = continuation->arg(LaunchArgs::Config)->as<Tuple>(); auto kernel = continuation->arg(LaunchArgs::Body)->as<Global>()->init()->as<Continuation>(); auto kernel_name = builder_.CreateGlobalStringPtr(kernel->name().str()); auto file_name = builder_.CreateGlobalStringPtr(continuation->world().name() + ext); const size_t num_kernel_args = continuation->num_args() - LaunchArgs::Num; // allocate argument pointers, sizes, and types llvm::Value* args = code_gen.emit_alloca(llvm::ArrayType::get(builder_.getInt8PtrTy(), num_kernel_args), "args"); llvm::Value* sizes = code_gen.emit_alloca(llvm::ArrayType::get(builder_.getInt32Ty(), num_kernel_args), "sizes"); llvm::Value* aligns = code_gen.emit_alloca(llvm::ArrayType::get(builder_.getInt32Ty(), num_kernel_args), "aligns"); llvm::Value* types = code_gen.emit_alloca(llvm::ArrayType::get(builder_.getInt8Ty(), num_kernel_args), "types"); // fill array of arguments for (size_t i = 0; i < num_kernel_args; ++i) { auto target_arg = continuation->arg(i + LaunchArgs::Num); const auto target_val = code_gen.lookup(target_arg); KernelArgType arg_type; llvm::Value* void_ptr; if (target_arg->type()->isa<DefiniteArrayType>() || target_arg->type()->isa<StructType>() || target_arg->type()->isa<TupleType>()) { // definite array | struct | tuple auto alloca = code_gen.emit_alloca(target_val->getType(), target_arg->name().str()); builder_.CreateStore(target_val, alloca); // check if argument type contains pointers if (!contains_ptrtype(target_arg->type())) WDEF(target_arg, "argument '{}' of aggregate type '{}' contains pointer (not supported in OpenCL 1.2)", target_arg, target_arg->type()); void_ptr = builder_.CreatePointerCast(alloca, builder_.getInt8PtrTy()); arg_type = KernelArgType::Struct; } else if (target_arg->type()->isa<PtrType>()) { auto ptr = target_arg->type()->as<PtrType>(); auto rtype = ptr->pointee(); if (!rtype->isa<ArrayType>()) EDEF(target_arg, "currently only pointers to arrays supported as kernel argument; argument has different type: {}", ptr); auto alloca = code_gen.emit_alloca(builder_.getInt8PtrTy(), target_arg->name().str()); auto target_ptr = builder_.CreatePointerCast(target_val, builder_.getInt8PtrTy()); builder_.CreateStore(target_ptr, alloca); void_ptr = builder_.CreatePointerCast(alloca, builder_.getInt8PtrTy()); arg_type = KernelArgType::Ptr; } else { // normal variable auto alloca = code_gen.emit_alloca(target_val->getType(), target_arg->name().str()); builder_.CreateStore(target_val, alloca); void_ptr = builder_.CreatePointerCast(alloca, builder_.getInt8PtrTy()); arg_type = KernelArgType::Val; } auto arg_ptr = builder_.CreateInBoundsGEP(args, llvm::ArrayRef<llvm::Value*>{builder_.getInt32(0), builder_.getInt32(i)}); auto size_ptr = builder_.CreateInBoundsGEP(sizes, llvm::ArrayRef<llvm::Value*>{builder_.getInt32(0), builder_.getInt32(i)}); auto align_ptr = builder_.CreateInBoundsGEP(aligns, llvm::ArrayRef<llvm::Value*>{builder_.getInt32(0), builder_.getInt32(i)}); auto type_ptr = builder_.CreateInBoundsGEP(types, llvm::ArrayRef<llvm::Value*>{builder_.getInt32(0), builder_.getInt32(i)}); auto size = layout_.getTypeStoreSize(target_val->getType()); if (auto struct_type = llvm::dyn_cast<llvm::StructType>(target_val->getType())) { // In the case of a structure, do not include the padding at the end in the size auto last_elem = struct_type->getStructNumElements() - 1; auto last_offset = layout_.getStructLayout(struct_type)->getElementOffset(last_elem); size = last_offset + layout_.getTypeStoreSize(struct_type->getStructElementType(last_elem)); } builder_.CreateStore(void_ptr, arg_ptr); builder_.CreateStore(builder_.getInt32(size), size_ptr); builder_.CreateStore(builder_.getInt32(layout_.getABITypeAlignment(target_val->getType())), align_ptr); builder_.CreateStore(builder_.getInt8((uint8_t)arg_type), type_ptr); } // allocate arrays for the grid and block size const auto get_u32 = [&](const Def* def) { return builder_.CreateSExt(code_gen.lookup(def), builder_.getInt32Ty()); }; llvm::Value* grid_array = llvm::UndefValue::get(llvm::ArrayType::get(builder_.getInt32Ty(), 3)); grid_array = builder_.CreateInsertValue(grid_array, get_u32(it_space->op(0)), 0); grid_array = builder_.CreateInsertValue(grid_array, get_u32(it_space->op(1)), 1); grid_array = builder_.CreateInsertValue(grid_array, get_u32(it_space->op(2)), 2); llvm::Value* grid_size = code_gen.emit_alloca(grid_array->getType(), ""); builder_.CreateStore(grid_array, grid_size); llvm::Value* block_array = llvm::UndefValue::get(llvm::ArrayType::get(builder_.getInt32Ty(), 3)); block_array = builder_.CreateInsertValue(block_array, get_u32(it_config->op(0)), 0); block_array = builder_.CreateInsertValue(block_array, get_u32(it_config->op(1)), 1); block_array = builder_.CreateInsertValue(block_array, get_u32(it_config->op(2)), 2); llvm::Value* block_size = code_gen.emit_alloca(block_array->getType(), ""); builder_.CreateStore(block_array, block_size); std::vector<llvm::Value*> gep_first_elem{builder_.getInt32(0), builder_.getInt32(0)}; grid_size = builder_.CreateInBoundsGEP(grid_size, gep_first_elem); block_size = builder_.CreateInBoundsGEP(block_size, gep_first_elem); args = builder_.CreateInBoundsGEP(args, gep_first_elem); sizes = builder_.CreateInBoundsGEP(sizes, gep_first_elem); aligns = builder_.CreateInBoundsGEP(aligns, gep_first_elem); types = builder_.CreateInBoundsGEP(types, gep_first_elem); launch_kernel(target_device, file_name, kernel_name, grid_size, block_size, args, sizes, aligns, types, builder_.getInt32(num_kernel_args)); return continuation->arg(LaunchArgs::Return)->as_continuation(); }
int main (int argc, char *argv[]) { uint64_t skip = 0; uint64_t left = -1; if (argc >= 2) skip = atoll (argv[1]); if (argc >= 3) left = atoll (argv[2]); printf ("Loading Kernel...\n"); const char *filename = KERNEL_SRC; struct stat s; if (stat (filename, &s) == -1) { fprintf (stderr, "%s: %s in line %d\n", filename, strerror (errno), __LINE__); return (-1); } FILE *fp = fopen (filename, "rb"); if (fp == NULL) { fprintf (stderr, "%s: %s in line %d\n", filename, strerror (errno), __LINE__); return (-1); } char *source_buf = (char *) malloc (s.st_size + 1); if (!fread (source_buf, sizeof (char), s.st_size, fp)) { fprintf (stderr, "%s: %s in line %d\n", filename, strerror (errno), __LINE__); return (-1); } source_buf[s.st_size] = 0; fclose (fp); const char *sourceBuf[] = { source_buf }; const size_t sourceLen[] = { s.st_size + 1 }; printf ("Initializing OpenCL...\n"); cl_platform_id platform; cl_uint num_devices = 0; cl_device_id devices[MAX_PLATFORM]; gc_clGetPlatformIDs (1, &platform, NULL); gc_clGetDeviceIDs (platform, DEV_TYPE, MAX_PLATFORM, devices, &num_devices); gpu_ctx_t gpu_ctxs[MAX_GPU]; memset (gpu_ctxs, 0, sizeof (gpu_ctxs)); for (cl_uint device_id = 0; device_id < num_devices; device_id++) { cl_device_id device = devices[device_id]; cl_context context = gc_clCreateContext (NULL, 1, &device, NULL, NULL); cl_program program = gc_clCreateProgramWithSource (context, 1, sourceBuf, sourceLen); gc_clBuildProgram (program, 1, &device, BUILD_OPTS, NULL, NULL); cl_kernel kernel = gc_clCreateKernel (program, KERNEL_NAME); cl_command_queue command_queue = gc_clCreateCommandQueue (context, device, 0); cl_uint max_compute_units; gc_clGetDeviceInfo (device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof (max_compute_units), &max_compute_units, NULL); char device_name[BUFSIZ]; memset (device_name, 0, sizeof (device_name)); gc_clGetDeviceInfo (device, CL_DEVICE_NAME, sizeof (device_name), &device_name, NULL); printf ("Found new device #%2d: %s, %u compute units\n", device_id, device_name, max_compute_units); const int num_threads = GPU_THREADS; const int num_elements = max_compute_units * num_threads * GPU_ACCEL; /** * GPU memory */ const size_t size_block = num_elements * sizeof (block_t); const size_t size_results = num_threads * sizeof (uint32_t); cl_mem d_block = gc_clCreateBuffer (context, CL_MEM_READ_ONLY, size_block, NULL); cl_mem d_results = gc_clCreateBuffer (context, CL_MEM_WRITE_ONLY, size_results, NULL); gc_clSetKernelArg (kernel, 0, sizeof (cl_mem), (void *) &d_block); gc_clSetKernelArg (kernel, 1, sizeof (cl_mem), (void *) &d_results); /** * Host memory */ block_t *h_block = (block_t *) malloc (size_block); uint32_t *h_results = (uint32_t *) malloc (size_results); memset (h_results, 0xff, size_results); gc_clEnqueueWriteBuffer (command_queue, d_results, CL_TRUE, 0, size_results, h_results, 0, NULL, NULL); /** * Buffers for candidates */ uint8_t **plains_buf = (uint8_t **) calloc (num_elements * VECT_SIZE, sizeof (uint8_t *)); for (int i = 0; i < num_elements * VECT_SIZE; i++) { /* Agreed, this is not nice. But who cares nowadays? */ plains_buf[i] = (uint8_t *) malloc (MAX_LINELEN); } size_t *plains_len = (size_t *) calloc (num_elements * VECT_SIZE, sizeof (size_t)); gpu_ctx_t *gpu_ctx = &gpu_ctxs[device_id]; gpu_ctx->context = context; gpu_ctx->program = program; gpu_ctx->kernel = kernel; gpu_ctx->command_queue = command_queue; gpu_ctx->max_compute_units = max_compute_units; gpu_ctx->d_block = d_block; gpu_ctx->d_results = d_results; gpu_ctx->h_block = h_block; gpu_ctx->h_results = h_results; gpu_ctx->num_threads = num_threads; gpu_ctx->num_elements = num_elements; gpu_ctx->plains_buf = plains_buf; gpu_ctx->plains_len = plains_len; } /* static salt */ const uint8_t salt_buf[16] = { 0x97, 0x48, 0x6C, 0xAA, 0x22, 0x5F, 0xE8, 0x77, 0xC0, 0x35, 0xCC, 0x03, 0x73, 0x23, 0x6D, 0x51 }; const size_t salt_len = sizeof (salt_buf); /* main loop */ printf ("Initialization done, accepting candidates from stdin...\n\n"); cl_uint cur_device_id = 0; while (!feof (stdin)) { /* Get new password candidate from stdin */ uint8_t line_buf[MAX_LINELEN]; int cur_c = 0; int prev_c = 0; size_t line_len = 0; for (size_t i = 0; i < MAX_LINELEN - 100; i++) // - 100 = we need some space for salt and padding { cur_c = getchar (); if (cur_c == EOF) break; if ((prev_c == '\n') && (cur_c == '\0')) { line_len--; break; } line_buf[line_len] = cur_c; line_len++; prev_c = cur_c; } /* chop \r if it exists for some reason (in case user used a dictionary) */ if (line_len >= 2) { if ((prev_c == '\r') && (cur_c == '\0')) line_len -= 2; } /* skip empty lines */ if (line_len == 0) continue; /* The following enables distributed computing / resume work */ if (skip) { skip--; continue; } if (left) { left--; } else { break; } /* Append constant salt */ memcpy (line_buf + line_len, salt_buf, salt_len); line_len += salt_len; /* Generate digest out of it */ uint32_t digest[4]; md5_transform ((uint32_t *) line_buf, (uint32_t) line_len, digest); /* Next garanteed free GPU */ gpu_ctx_t *gpu_ctx = &gpu_ctxs[cur_device_id]; /* Save original buffer in case it cracks it */ memcpy (gpu_ctx->plains_buf[gpu_ctx->num_cached], line_buf, line_len - salt_len); gpu_ctx->plains_len[gpu_ctx->num_cached] = line_len - salt_len; /* Next garanteed free memory element on that GPU */ const uint32_t element_div = gpu_ctx->num_cached / 4; const uint32_t element_mod = gpu_ctx->num_cached % 4; /* Copy new digest */ gpu_ctx->h_block[element_div].A[element_mod] = digest[0]; gpu_ctx->h_block[element_div].B[element_mod] = digest[1]; gpu_ctx->h_block[element_div].C[element_mod] = digest[2]; gpu_ctx->h_block[element_div].D[element_mod] = digest[3]; gpu_ctx->num_cached++; /* If memory elements on that GPU are full, switch to the next GPU */ if ((gpu_ctx->num_cached / VECT_SIZE) < gpu_ctx->num_elements) continue; cur_device_id++; /* If there is no more GPU left, run the calculation */ if (cur_device_id < num_devices) continue; /* Fire! */ calc_work (num_devices, gpu_ctxs); launch_kernel (num_devices, gpu_ctxs); /* Collecting data has a blocking effect */ check_results (num_devices, gpu_ctxs); /* Reset buffer state */ for (cl_uint device_id = 0; device_id < num_devices; device_id++) { gpu_ctx_t *gpu_ctx = &gpu_ctxs[device_id]; gpu_ctx->num_cached = 0; } cur_device_id = 0; } /* Final calculation of leftovers */ calc_work (num_devices, gpu_ctxs); launch_kernel (num_devices, gpu_ctxs); check_results (num_devices, gpu_ctxs); return -1; }
void compute_process() { int np, pid; MPI_Comm_rank(MPI_COMM_WORLD, &pid); MPI_Comm_size(MPI_COMM_WORLD, &np); int server_process = np - 1; MPI_Status status; int num_comp_nodes = np -1; unsigned int num_bytes = sizeof(sAgents); unsigned int num_halo_points = RADIO * world_width; unsigned int num_halo_bytes = num_halo_points * sizeof(int); size_t size_world = world_width * world_height * sizeof(int); int *h_world = (int *)malloc(size_world); int *d_world; int left_neighbor = (pid > 0) ? (pid - 1) : MPI_PROC_NULL; int right_neighbor = (pid < np -2) ? (pid + 1) : MPI_PROC_NULL; for(int j = 0; j < world_width * world_height; j++) { h_world[j] = 0; } sAgents h_agents_in, h_agents_left_node, h_agents_right_node; float4 h_agents_pos[agents_total], h_agents_ids[agents_total]; float4 *d_agents_pos, *d_agents_ids; unsigned int num_bytes_agents = agents_total * sizeof(float4); int world_height_node = world_height / num_comp_nodes; // Error code to check return values for CUDA calls cudaError_t err = cudaSuccess; // Allocate the device pointer err = cudaMalloc((void **)&d_world, size_world); if (err != cudaSuccess) { fprintf(stderr, "Failed to allocate device pointer (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } err = cudaMalloc((void **)&d_agents_pos, num_bytes_agents); if (err != cudaSuccess) { fprintf(stderr, "Failed to allocate device pointer (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } err = cudaMalloc((void **)&d_agents_ids, num_bytes_agents); if (err != cudaSuccess) { fprintf(stderr, "Failed to allocate device pointer (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } MPI_Recv(&h_agents_in, num_bytes, MPI_BYTE, server_process, 0, MPI_COMM_WORLD, &status); for(int i = 0; i < agents_total; i++) { //identify the active agents according to the y coordinate and set the busy cells in the world if( ( round(h_agents_in.pos[i].y) >= (pid * world_height_node) ) and ( round(h_agents_in.pos[i].y) < ( (pid + 1) * world_height_node ) ) ) { h_agents_in.ids[i].y = 1; h_world[(int)round( (world_width * (h_agents_in.pos[i].y - 1) ) + h_agents_in.pos[i].x )] = h_agents_in.ids[i].x; } //Copy the data to a local arrays h_agents_pos[i] = h_agents_in.pos[i]; h_agents_ids[i] = h_agents_in.ids[i]; } err = cudaMemcpy(d_world, h_world, size_world, cudaMemcpyHostToDevice); if (err != cudaSuccess) { fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } //for(int it = 0; it < nreps ; it++) while(1) { int it=4; err = cudaMemcpy(d_agents_pos, h_agents_pos, num_bytes_agents, cudaMemcpyHostToDevice); if (err != cudaSuccess) { fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } err = cudaMemcpy(d_agents_ids, h_agents_ids, num_bytes_agents, cudaMemcpyHostToDevice); if (err != cudaSuccess) { fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } launch_kernel(d_agents_pos, d_agents_ids, d_world, world_width, world_height, agent_width, agent_height, world_height_node, pid ); cudaMemcpy(h_agents_pos, d_agents_pos, num_bytes_agents, cudaMemcpyDeviceToHost); cudaMemcpy(h_agents_ids, d_agents_ids, num_bytes_agents, cudaMemcpyDeviceToHost); //copy the data to the struct for( int i = 0; i < agents_total; i++) { h_agents_in.pos[i] = h_agents_pos[i]; h_agents_in.ids[i] = h_agents_ids[i]; } MPI_Barrier(MPI_COMM_WORLD); MPI_Send(&h_agents_in, num_bytes, MPI_BYTE, server_process, DATA_COLLECT, MPI_COMM_WORLD); #ifdef DEBUG //printf("pid: %d\n", pid); //display_data(h_agents_in); #endif // send data to left, get data from right MPI_Sendrecv(&h_agents_in, num_bytes, MPI_BYTE, left_neighbor, it, &h_agents_right_node, num_bytes, MPI_BYTE, right_neighbor, it, MPI_COMM_WORLD, &status); // send data to right, get data from left MPI_Sendrecv(&h_agents_in, num_bytes, MPI_BYTE, right_neighbor, it, &h_agents_left_node, num_bytes, MPI_BYTE, left_neighbor, it, MPI_COMM_WORLD, &status); for( int i = 0; i < agents_total; i++) { if(pid != np-2) { if(h_agents_right_node.ids[i].y == 2) { h_agents_in.pos[i] = h_agents_right_node.pos[i]; h_agents_pos[i] = h_agents_right_node.pos[i]; h_agents_in.ids[i].y = 1; h_agents_ids[i].y = 1; } } if(pid != 0) { if(h_agents_left_node.ids[i].y == 3) { h_agents_in.pos[i] = h_agents_left_node.pos[i]; h_agents_pos[i] = h_agents_left_node.pos[i]; h_agents_in.ids[i].y = 1; h_agents_ids[i].y = 1; } } } /*** if(pid == 1) { printf("pid: %d\n", pid); display_data(h_agents_in); display_data(h_agents_right_node); display_data(h_agents_left_node); } ***/ } /* Release resources */ // free(h_agents_in); /* free(h_output); cudaFreeHost(h_left_boundary); cudaFreeHost(h_right_boundary); cudaFreeHost(h_left_halo); cudaFreeHost(h_right_halo); cudaFree(d_input); cudaFree(d_output); */ }
void compute_process(int agents_total, int nreps, int world_width, int world_height) { int np, pid; MPI_Comm_rank(MPI_COMM_WORLD, &pid); MPI_Comm_size(MPI_COMM_WORLD, &np); int server_process = np - 1; MPI_Status status; /* create a type for struct agent */ const int nitems=5; int blocklengths[5] = {1,1,1,1,1}; MPI_Datatype types[5] = {MPI_INT, MPI_INT, MPI_INT, MPI_FLOAT, MPI_FLOAT}; MPI_Datatype mpi_agent_type; MPI_Aint offsets[5]; offsets[0] = offsetof(agent, id); offsets[1] = offsetof(agent, x); offsets[2] = offsetof(agent, y); offsets[3] = offsetof(agent, z); offsets[4] = offsetof(agent, w); MPI_Type_create_struct(nitems, blocklengths, offsets, types, &mpi_agent_type); MPI_Type_commit(&mpi_agent_type); unsigned int num_bytes = agents_total * sizeof(float4); unsigned int num_halo_points = RADIO * world_width; unsigned int num_halo_bytes = num_halo_points * sizeof(short int); //unsigned int world_node_height = (world_height / (np-1)) + (RADIO * 2); //if(pid == 0 or pid == np - 2) // world_node_height -= RADIO; size_t size_world = world_width * world_height * sizeof(short int); short int *h_world = (short int *)malloc(size_world); *h_world = 0; short int *d_world; for(int j = 0; j < world_width * world_height; j++) { h_world[j] = 0; } /* alloc host memory */ agent *h_agents_in = (agent *)malloc(num_bytes); //agent *d_agents_in; float4 *h_agents_pos; float4 *d_agents_pos; //MPI_Recv(rcv_address, num_points, MPI_FLOAT, server_process, MPI_ANY_TAG, MPI_COMM_WORLD, &status); MPI_Recv(h_agents_in, agents_total, mpi_agent_type, server_process, 0, MPI_COMM_WORLD, &status); //Iniatialize world for( int i = 0; i < agents_total; i++) { h_world[(world_width * (h_agents_in[i].y - 1) ) + h_agents_in[i].x] = (h_agents_in[i].x!=0?1:0); //if(h_world[(world_width * (h_agents_in[i].y - 1) ) + h_agents_in[i].x] == 1) //printf("world x: %d, y: %d\n", h_agents_in[i].x, h_agents_in[i].y); h_agents_pos[i].x = h_agents_in[i].x; h_agents_pos[i].y = h_agents_in[i].y; h_agents_pos[i].z = h_agents_in[i].z; h_agents_pos[i].w = h_agents_in[i].w; } /*** if(pid ==1) { int k=0; for(int j = 0; j < world_width * world_height; j++) { if ( j%96 == 0 and j>0) { k++; printf("%d row: %d\n", h_world[j], k); } else printf("%d ", h_world[j]); } } ***/ // Error code to check return values for CUDA calls cudaError_t err = cudaSuccess; // Allocate the device pointer err = cudaMalloc((void **)&d_world, size_world); if (err != cudaSuccess) { fprintf(stderr, "Failed to allocate device pointer (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } err = cudaMemcpy(d_world, h_world, size_world, cudaMemcpyHostToDevice); if (err != cudaSuccess) { fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } //http://cuda-programming.blogspot.com.es/2013/02/cuda-array-in-cuda-how-to-use-cuda.html //http://stackoverflow.com/questions/17924705/structure-of-arrays-vs-array-of-structures-in-cuda // Allocate the device pointer err = cudaMalloc((void **)&d_agents_pos, num_bytes); if (err != cudaSuccess) { fprintf(stderr, "Failed to allocate device pointer (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } err = cudaMemcpy(d_agents_pos, h_agents_pos, num_bytes, cudaMemcpyHostToDevice); if (err != cudaSuccess) { fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } launch_kernel(d_agents_pos, d_world, world_width, world_height ); MPI_Barrier( MPI_COMM_WORLD); #ifdef DEBUG // printf("pid: %d\n", pid); // display_data(h_agents_in, agents_total ); #endif MPI_Send(h_agents_in, agents_total, mpi_agent_type, server_process, DATA_COLLECT, MPI_COMM_WORLD); /* Release resources */ free(h_agents_in); /* free(h_output); cudaFreeHost(h_left_boundary); cudaFreeHost(h_right_boundary); cudaFreeHost(h_left_halo); cudaFreeHost(h_right_halo); cudaFree(d_input); cudaFree(d_output); */ }