/*
 * Runs irr_fieldbu_kernel over an irregular set of field points.
 *
 * Uploads vecs (nr_lines entries), y and RF (field_nr_points entries) to the
 * file-level buffers vecs_buf / y_buf / RF_buf, launches the kernel with one
 * work item per field point (rounded up to a multiple of LOCAL_GROUP_SIZE),
 * and reads RF_buf back into RF.
 *
 * nr_lines, linelength and resolutionfactor are forwarded to the kernel
 * unchanged.
 */
void ocl_make_fieldbu_irregular(int nr_lines, int field_nr_points, int linelength, const cl_double4* vecs, const double* y, cl_double4* RF, double resolutionfactor) {
	icl_write_buffer(vecs_buf, CL_FALSE, sizeof(cl_double4) * nr_lines, &vecs[0], NULL, NULL);
	/* NOTE(review): by operator precedence the byte count below is
	 * (sizeof(cl_double) * linelength * nr_lines) + 1, i.e. one extra BYTE,
	 * not one extra element. Left unchanged because the allocation of y_buf
	 * is not visible here - confirm the intended size. */
	icl_write_buffer(y_buf, CL_FALSE, sizeof(cl_double) * linelength*nr_lines+1, &y[0], NULL, NULL);
	icl_write_buffer(RF_buf, CL_FALSE, sizeof(cl_double4) * field_nr_points, &RF[0], NULL, NULL);

	size_t size = field_nr_points;
	size_t szLocalWorkSize = LOCAL_GROUP_SIZE;
	/* round the global size up to the next multiple of the local size;
	 * integer arithmetic stays exact where a float quotient would not */
	size_t szGlobalWorkSize = ((size + szLocalWorkSize - 1) / szLocalWorkSize) * szLocalWorkSize;

	/* size_t values need %zu; %d is a format/argument mismatch */
	printf("\n\nocl_make_field_irregular with blocking and unrolling (%d), global size %zu, local size %zu\n",
			LOOP_UNROLL, szGlobalWorkSize, szLocalWorkSize);

	icl_run_kernel(irr_fieldbu_kernel, 1, &szGlobalWorkSize, &szLocalWorkSize, NULL, NULL, 6,
			sizeof(cl_int), (void *)&nr_lines,
			sizeof(cl_int), (void *)&linelength,
			(size_t)0, (void *)vecs_buf,
			(size_t)0, (void *)y_buf,
			(size_t)0, (void *)RF_buf,
			sizeof(cl_double), (void *)&resolutionfactor
	);

	icl_read_buffer(RF_buf, CL_TRUE, sizeof(cl_double4) * field_nr_points, &RF[0], NULL, NULL);
	icl_finish(device);
}
void ocl_make_field1bu(int nr_lines, int fielddim_x, int fielddim_y, int fielddim_z, int linelength, const cl_double4* vecs, const double* y, double* RF, double resolutionfactor) { icl_write_buffer(vecs_buf, CL_FALSE, sizeof(cl_double4) * nr_lines, &vecs[0], NULL, NULL); icl_write_buffer(y_buf, CL_FALSE, sizeof(cl_double) * linelength*nr_lines+1, &y[0], NULL, NULL); size_t szGlobalWorkSize[1] = { fielddim_x*fielddim_y*fielddim_z }; size_t szLocalWorkSize[1] = { LOCAL_GROUP_SIZE }; printf("\n\nocl_make_field1 with blocking and unrolling factor %d, global size (%d), local size (%d)\n", LOOP_UNROLL, szGlobalWorkSize[0], szLocalWorkSize[0]); icl_run_kernel(reg_field1bu_kernel, 1, szGlobalWorkSize, szLocalWorkSize, NULL, NULL, 9, sizeof(cl_int), (void *)&nr_lines, sizeof(cl_int), (void *)&fielddim_x, sizeof(cl_int), (void *)&fielddim_y, sizeof(cl_int), (void *)&fielddim_z, sizeof(cl_int), (void *)&linelength, (size_t)0, (void *)vecs_buf, (size_t)0, (void *)y_buf, (size_t)0, (void *)RF_buf, sizeof(cl_double), (void *)&resolutionfactor ); icl_read_buffer(RF_buf, CL_TRUE, sizeof(cl_double) * fielddim_x * fielddim_y * fielddim_z, &RF[0], NULL, NULL); icl_finish(device); }
/* unrool of all loops */ void ocl_make_field3(int nr_lines, int fielddim_x, int fielddim_y, int fielddim_z, int linelength, const cl_double4* vecs, const double* y, double* RF, double resolutionfactor) { icl_write_buffer(vecs_buf, CL_FALSE, sizeof(cl_double4) * nr_lines, &vecs[0], NULL, NULL); icl_write_buffer(y_buf, CL_FALSE, sizeof(cl_double) * linelength*nr_lines+1, &y[0], NULL, NULL); size_t szLocalWorkSize = LOCAL_GROUP_SIZE; size_t size = nr_lines * fielddim_x * fielddim_y * fielddim_z; float multiplier = size/(float)szLocalWorkSize; if(multiplier > (int)multiplier) multiplier += 1; size_t szGlobalWorkSize = (int)multiplier * szLocalWorkSize; printf("\n\nocl_make_field3, global size (%d), local size (%d)\n", szGlobalWorkSize, szLocalWorkSize); icl_run_kernel(reg_field3_kernel, 1, &szGlobalWorkSize, &szLocalWorkSize, NULL, NULL, 9, sizeof(cl_int), (void *)&nr_lines, sizeof(cl_int), (void *)&fielddim_x, sizeof(cl_int), (void *)&fielddim_y, sizeof(cl_int), (void *)&fielddim_z, sizeof(cl_int), (void *)&linelength, (size_t)0, (void *)vecs_buf, (size_t)0, (void *)y_buf, (size_t)0, (void *)RF_buf, sizeof(cl_double), (void *)&resolutionfactor ); icl_read_buffer(RF_buf, CL_TRUE, sizeof(cl_double) * fielddim_x * fielddim_y * fielddim_z, &RF[0], NULL, NULL); icl_finish(device); }
int main(int argc, const char* argv[]) { icl_args* args = icl_init_args(); icl_parse_args(argc, argv, args); chdir(PATH); int size = args->size; icl_print_args(args); cl_float4* output = (cl_float4*)malloc(sizeof(cl_float4) * size); icl_init_devices(args->device_type); icl_start_energy_measurement(); if (icl_get_num_devices() != 0) { icl_device* dev = icl_get_device(args->device_id); icl_print_device_short_info(dev); icl_kernel* kernel = icl_create_kernel(dev, "sinewave.cl", "sinewave", "", ICL_SOURCE); size_t szLocalWorkSize = args->local_size; float multiplier = size/(float)szLocalWorkSize; if(multiplier > (int)multiplier) multiplier += 1; size_t szGlobalWorkSize = (int)multiplier * szLocalWorkSize; for (int i = 0; i < args->loop_iteration; ++i) { icl_buffer* buf_output = icl_create_buffer(dev, CL_MEM_WRITE_ONLY, sizeof(cl_float4) * size); icl_run_kernel(kernel, 1, &szGlobalWorkSize, &szLocalWorkSize, NULL, NULL, 2, (size_t)0, (void *)buf_output, sizeof(cl_int), (void *)&size); icl_read_buffer(buf_output, CL_TRUE, sizeof(cl_float4) * size, &output[0], NULL, NULL); icl_release_buffer(buf_output); } icl_release_kernel(kernel); } icl_stop_energy_measurement(); // for the test check printf("Result check: OK\n"); icl_release_args(args); icl_release_devices(); free(output); }
/*
 * Writes the current particle state to "./output/snapshot_NNN".
 *
 * Reads nParticles particles from particles_device into particles_host,
 * calls energy_statistic() on them, converts position, velocity, id and
 * acceleration into a temporary particle_data array and passes that to
 * write_snapshot_format2(). NNN is a counter local to this function that is
 * incremented after every written snapshot.
 *
 * The per-particle mass is not copied; header.mass[1] is taken from the
 * first particle (all particles are assumed to share one mass for now).
 *
 * dev is not used by this function. If the temporary array cannot be
 * allocated, an error is printed and no snapshot is written.
 */
void out_snapshot(struct Particle *particles_host, icl_buffer* particles_device, const UINT nParticles, icl_device* dev, const FLOAT current_time)
{
	static int cs = 0;	/* index of the next snapshot file */
	char fn[200];
	io_header header;

	particle_data *P = (particle_data*)malloc(sizeof(particle_data) * nParticles);
	if(P == NULL && nParticles != 0) {
		fprintf(stderr, "out_snapshot: cannot allocate %lu particle records\n", (unsigned long)nParticles);
		return;
	}

	snprintf(fn, sizeof fn, "./output/snapshot_%03d", cs);

	icl_read_buffer(particles_device, CL_TRUE, sizeof(struct Particle) * nParticles, particles_host, NULL, NULL);
	energy_statistic(particles_host, nParticles, current_time);

	/* index with the same unsigned type as nParticles to avoid a signed/unsigned compare */
	for(UINT j = 0; j < nParticles; ++j)
	{
		P[j].Pos[0] = particles_host[j].pos.x;
		P[j].Pos[1] = particles_host[j].pos.y;
		P[j].Pos[2] = particles_host[j].pos.z;
		P[j].Vel[0] = particles_host[j].vel.x;
		P[j].Vel[1] = particles_host[j].vel.y;
		P[j].Vel[2] = particles_host[j].vel.z;
		/* mass is stored once in the header, not per particle */
		P[j].Id = particles_host[j].id;
		P[j].Accel[0] = particles_host[j].acc.x;
		P[j].Accel[1] = particles_host[j].acc.y;
		P[j].Accel[2] = particles_host[j].acc.z;
	}

	memset(&header, 0, sizeof(io_header));
	header.time = current_time;
	header.num_files = 1;
	header.npart[1] = header.npartTotal[1] = nParticles;
	if(nParticles != 0)
		header.mass[1] = particles_host[0].mass; //for now all particles have the same mass

	unsigned blocks = 8199; //0b10000000000111;
	write_snapshot_format2(fn, &header, P, blocks);
	cs++;

	free(P);
}
void ocl_make_field_outcore(int nr_lines, int fielddim_x, int fielddim_y, int fielddim_z, int linelength, const cl_double4* vecs, const double* y, double* RF, double resolutionfactor) { size_t xy = fielddim_x*fielddim_y; size_t z = fielddim_z; size_t overall_size = xy * z; size_t chunk_size = OUTOFCORE_SIZE; icl_write_buffer(vecs_buf, CL_FALSE, sizeof(cl_double4) * nr_lines, &vecs[0], NULL, NULL); icl_write_buffer(y_buf, CL_FALSE, sizeof(cl_double) * linelength*nr_lines+1, &y[0], NULL, NULL); printf("ocl_make_field out-of-core with overall size of %d, chunked by %d\n", overall_size, chunk_size); // for each chunk, we send and receive back a new part of the field for(size_t offset = 0, bid = 0; offset < overall_size; offset += chunk_size, bid++) { printf("block %d offset %d,", bid, offset); size_t current_chunk_size = MIN(overall_size-offset, chunk_size); size_t szGlobalWorkSize[1] = { current_chunk_size }; size_t szLocalWorkSize[1] = { LOCAL_GROUP_SIZE }; unsigned long long in_offset = offset; // printf("\nocl_make_field out-of-core, global size (%d), local size (%d)\n", szGlobalWorkSize[0], szLocalWorkSize[0]); icl_run_kernel(reg_field_outofcore_kernel, 1, szGlobalWorkSize, szLocalWorkSize, NULL, NULL, 10, sizeof(cl_int), (void*) &nr_lines, sizeof(cl_int), (void*) &fielddim_x, sizeof(cl_int), (void*) &fielddim_y, sizeof(cl_int), (void*) &fielddim_z, sizeof(cl_int), (void*) &linelength, (size_t)0, (void*) vecs_buf, (size_t)0, (void*) y_buf, (size_t)0, (void*) RF_buf, sizeof(cl_double), (void*) &resolutionfactor, sizeof(cl_ulong), (void*) &in_offset ); icl_read_buffer(&RF_buf[0], CL_FALSE, sizeof(cl_double) * current_chunk_size, &RF[offset], NULL, NULL); } icl_finish(device); printf("\n"); }
UINT compute_acceleration(UINT mode, icl_buffer* nodelist, icl_buffer* particles, const UINT nParticles, const FLOAT eps, UINT treeHeight, icl_device* dev, struct Particle* ref) { static icl_buffer* kdTree = NULL; if(kdTree == NULL) //just for the first time to initialize acceleration for opening criterion { kdTree = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(struct KdNode) * (nParticles * 2 - 1)); treeHeight = buildTree(nodelist, particles, kdTree, nParticles, dev); // first walk through the entire tree since acceleration is 0 printf("First tree walk\n"); walk(kdTree, particles, nParticles, eps, dev); #if timing == 1 // store acceleration of particles by unfolding the entire tree = direct force summation, used for correctness validation printf("Pruned tree walk\n"); icl_read_buffer(particles, CL_TRUE, sizeof(struct Particle) * nParticles, &ref[0], NULL, NULL); // second walk with pruned tree walk(kdTree, particles, nParticles, eps, dev); #endif } else if(mode == 1) //rebuild the tree { treeHeight = buildTree(nodelist, particles, kdTree, nParticles, dev); walk(kdTree, particles, nParticles, eps, dev); } else if(mode == 2) //TODO: dynamic tree update { updateTree(nodelist, particles, kdTree, nParticles, treeHeight, dev); walk(kdTree, particles, nParticles, eps, dev); } else //end of sim, release kdTree buffer { icl_release_buffer(kdTree); } return treeHeight; }
int main(int argc, const char* argv[]) { chdir(PATH); icl_args* args = icl_init_args(); icl_parse_args(argc, argv, args); icl_print_args(args); int size = args->size; cl_uint* ma = (cl_uint*) malloc(sizeof(cl_uint) * size); cl_uint* b = (cl_uint*) malloc(sizeof(cl_uint) * size); cl_uint* c = (cl_uint*) malloc(sizeof(cl_uint) * size); cl_uint* seed = (cl_uint*) malloc(sizeof(cl_uint) * size); cl_float4* result = (cl_float4*)malloc(sizeof(cl_float4) * size); for (cl_uint i = 0; i < size; ++i) { ma[i] = i; b[i] = i; c[i] = i; seed[i] = i; } icl_init_devices(args->device_type); icl_start_energy_measurement(); if (icl_get_num_devices() != 0) { icl_device* dev = icl_get_device(args->device_id); icl_print_device_short_info(dev); icl_kernel* kernel = icl_create_kernel(dev, "mers_twister.cl", "mersenne_twister", "", ICL_SOURCE); size_t szLocalWorkSize = args->local_size; float multiplier = size/(float)szLocalWorkSize; if(multiplier > (int)multiplier) multiplier += 1; size_t szGlobalWorkSize = (int)multiplier * szLocalWorkSize; for (int i = 0; i < args->loop_iteration; ++i) { icl_buffer* buf_ma = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(cl_uint) * size); icl_buffer* buf_b = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(cl_uint) * size); icl_buffer* buf_c = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(cl_uint) * size); icl_buffer* buf_seed = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(cl_uint) * size); icl_buffer* buf_result = icl_create_buffer(dev, CL_MEM_WRITE_ONLY, sizeof(cl_float4) * size); icl_write_buffer(buf_ma, CL_FALSE, sizeof(cl_uint) * size, &ma[0], NULL, NULL); icl_write_buffer(buf_b, CL_FALSE, sizeof(cl_uint) * size, &b[0], NULL, NULL); icl_write_buffer(buf_c, CL_FALSE, sizeof(cl_uint) * size, &c[0], NULL, NULL); icl_write_buffer(buf_seed, CL_TRUE, sizeof(cl_uint) * size, &seed[0], NULL, NULL); icl_run_kernel(kernel, 1, &szGlobalWorkSize, &szLocalWorkSize, NULL, NULL, 6, (size_t)0, (void *)buf_ma, (size_t)0, (void *)buf_b, (size_t)0, (void 
*)buf_c, (size_t)0, (void *)buf_seed, (size_t)0, (void *)buf_result, sizeof(cl_int), (void *)&size ); icl_read_buffer(buf_result, CL_TRUE, sizeof(cl_float4) * size, &result[0], NULL, NULL); icl_release_buffers(5, buf_ma, buf_b, buf_c, buf_seed, buf_result); } icl_release_kernel(kernel); } icl_stop_energy_measurement(); if (args->check_result) { printf("======================\n= mersenne twister test\n"); printf("Check not Implemented\n"); printf("Result check: OK\n"); } else { printf("Result check: OK\n"); } icl_release_devices(); free(ma); free(b); free(c); free(seed); free(result); return 0; }
//main simulation loop void run(const FLOAT t_max, const FLOAT eps, const FLOAT ErrTolIntAccuracy, struct Particle *particles_host, icl_buffer* particles_device, const UINT nParticles, icl_device* dev, struct Tree *tree, struct Particle *ref) { UINT k = 0; UINT treeHeight = 0; //FLOAT dt=1.5e-6; //FLOAT dt=1.220703125e-5; FLOAT dt=3.05176e-06; FLOAT current_time = 0.0; //time before current full timestep (drift), kicks are at current_time-+dt/2.0 FLOAT timeBetSnapshot = 1e-3; FLOAT timeLastSnapshot = 0.0; /* tree->nodelist[0].center_of_mass.x = 1; tree->nodelist[0].center_of_mass.y = 2; tree->nodelist[0].center_of_mass.z = 3; tree->nodelist[0].center_geometric.x = 1.3; tree->nodelist[0].center_geometric.y = 2.2; tree->nodelist[0].center_geometric.z = 3.1; tree->nodelist[0].mass = 77.0; tree->nodelist[0].l = 42.7; tree->nodelist[0].bounding_box.box[0].x = -3.0; tree->nodelist[0].bounding_box.box[0].y = -3.2; tree->nodelist[0].bounding_box.box[0].z = -3.3; tree->nodelist[0].bounding_box.box[1].x = 3.0; tree->nodelist[0].bounding_box.box[1].y = 3.2; tree->nodelist[0].bounding_box.box[1].z = 3.3; tree->nodelist[0].size = 8; tree->nodelist[0].level = 7; tree->nodelist[0].address = 17; tree->nodelist[0].left_child = 1; tree->nodelist[0].right_child = 2; tree->nodelist[0].split_dim = 3; */ // create root node in nodelist tree->nodelist[0].particlesLow = 0; tree->nodelist[0].particlesHigh = nParticles; tree->nodelist[0].level = 0; //tree->nodelist[0].bounding_box = bounding_box; tree->nodelist[0].address = 0; icl_buffer* nodelist = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(struct Node) * (nParticles * 2 - 1)); // copy root node to ocl device icl_write_buffer(nodelist, CL_TRUE, sizeof(struct Node), &(tree->nodelist[0]), NULL, NULL); //snapshot_000 IC with computed code properties (acceleration...) // out_snapshot(particles_host, particles_device, nParticles, dev, 0.0); // TODO particles have been resorted during tree construction. 
Upload them in original sorting for comparison, not needed for correctness REMOVE IT! // icl_write_buffer(particles_device, CL_TRUE, sizeof(struct Particle) * nParticles, &particles_host[0], NULL, NULL); //just for the first time, kick to half timestep treeHeight = compute_acceleration(1, nodelist, particles_device, nParticles, eps, treeHeight, dev, ref); #if timing == 1 return; #endif //dt = calcTimestep(eps, ErrTolIntAccuracy, particles, nParticles); kick(dt/2.0, particles_device, nParticles, dev); // out_snapshot(particles_host, particles_device, nParticles, dev, 0.0); //TEST icl_read_buffer(particles_device, CL_TRUE, sizeof(struct Particle) * nParticles, particles_host, NULL, NULL); printf("\tid: %d pos: %g %g %g vel: %g %g %g\n", particles_host[0].id, particles_host[0].pos.x, particles_host[0].pos.y, particles_host[0].pos.z, particles_host[0].vel.x, particles_host[0].vel.y, particles_host[0].vel.z); printf("\tacc: %g %g %g\n", particles_host[0].acc.x, particles_host[0].acc.y, particles_host[0].acc.z); // while(current_time < t_max) { current_time += dt; printf("___step: %d time: %g\n", k++, current_time); // printf("\tid: %d pos: %g %g %g vel: %g %g %g\n", ) //drift to next full timestep at current_time drift(dt, particles_device, nParticles, dev); //get new accelerations treeHeight = compute_acceleration(2, nodelist, particles_device, nParticles, eps, treeHeight, dev, ref); //TODO: mode 2: implement dynamic tree update //kick particles to current_time+dt/2.0 kick(dt, particles_device, nParticles, dev); //output & energy statistic if(current_time-timeLastSnapshot > timeBetSnapshot) { out_snapshot(particles_host, particles_device, nParticles, dev, current_time); timeLastSnapshot = current_time; } //TEST icl_read_buffer(particles_device, CL_TRUE, sizeof(struct Particle) * nParticles, particles_host, NULL, NULL); printf("\tid: %d pos: %g %g %g vel: %g %g %g\n", particles_host[0].id, particles_host[0].pos.x, particles_host[0].pos.y, particles_host[0].pos.z, 
particles_host[0].vel.x, particles_host[0].vel.y, particles_host[0].vel.z); printf("\tacc: %g %g %g\n", particles_host[0].acc.x, particles_host[0].acc.y, particles_host[0].acc.z); } out_snapshot(particles_host, particles_device, nParticles, dev, current_time); //write a snapshot also for the final time printf("final time reached: %g\n", current_time); compute_acceleration(0, nodelist, particles_device, nParticles, eps, treeHeight, dev, ref); //clean up icl_release_buffer(nodelist); }
UINT buildTree(icl_buffer *nodelist, icl_buffer *particlesD, icl_buffer *treeD, UINT nParticles, icl_device* dev) { UINT level = 1; UINT nNodes = nParticles * 2 - 1; icl_timer* timer = icl_init_timer(ICL_MILLI); // void icl_start_timer(icl_timer* timer); double time = 0; // overapproximate size of temporal lists /* struct Node** activelist = (struct Node**)malloc(nParticles * sizeof(struct Node*)); UINT activeN = 0; struct Node** smalllist = (struct Node**)malloc(nParticles * sizeof(struct Node*)); UINT smallN = 0; struct Node** nextlist = (struct Node**)malloc(nParticles * sizeof(struct Node*)); UINT nextN = 0;*/ icl_buffer* activelist = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(NodeId) * nParticles); icl_buffer* smalllist = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(NodeId) * nParticles); icl_buffer* nextlist = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(NodeId) * nParticles); icl_buffer* sizes = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(UINT) * 5); // holds the current size of each of 3 buffers: // 0 activelist // 1 nodelist // 2 smalllist // 3 old max level // 4 new max level UINT maxNchunks = ((nParticles / fmin(T, chunk_size)) * 2) -1; // assert(maxNchunks <= 256 && "adapt implementation"); // TODO allow more than 256 chunks per node icl_buffer* chunks = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(struct Chunk) * maxNchunks); icl_buffer* bboxes = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(struct BBox) * maxNchunks); size_t localSize = 1; size_t globalSize = 1; /* struct Particle* particles = (struct Particle*)malloc(3000 * sizeof(struct Particle)); icl_read_buffer(particlesD, CL_TRUE, sizeof(struct Particle) * 3000, &particles[0], NULL, NULL); printf("%f %f %f\n", particles[0].pos.x, particles[0].pos.y, particles[0].pos.z); */ // compile OpenCL kernels icl_kernel* init = icl_create_kernel(dev, "kernel/init.cl", "init", KERNEL_BUILD_MACRO, ICL_SOURCE); icl_kernel* resetChunks = icl_create_kernel(dev, "kernel/init.cl", 
"memset_chunks", KERNEL_BUILD_MACRO, ICL_SOURCE); icl_kernel* gp2c = icl_create_kernel(dev, "kernel/groupToChunks.cl", "groupParticlesIntoChunks", KERNEL_BUILD_MACRO, ICL_SOURCE); icl_kernel* cBBox = icl_create_kernel(dev, "kernel/chunkedBBox.cl", "chunkedBBox", KERNEL_BUILD_MACRO, ICL_SOURCE); icl_kernel* bBox = icl_create_kernel(dev, "kernel/bBox.cl", "bBox", KERNEL_BUILD_MACRO, ICL_SOURCE); icl_kernel* sln = icl_create_kernel(dev, "kernel/splitLargeNodes.cl", "splitLargeNodes", KERNEL_BUILD_MACRO, ICL_SOURCE); icl_kernel* sortP = icl_create_kernel(dev, "kernel/sortParticlesToChilds.cl", "sortParticlesToChilds", KERNEL_BUILD_MACRO, ICL_SOURCE); icl_kernel* snf = icl_create_kernel(dev, "kernel/smallNodeFiltering.cl", "smallNodeFiltering", KERNEL_BUILD_MACRO, ICL_SOURCE); icl_kernel* pnl = icl_create_kernel(dev, "kernel/packNextlist.cl", "packNextlist", KERNEL_BUILD_MACRO, ICL_SOURCE); ////////////////////////////////////////////////////////////////////////// icl_kernel* preScan = icl_create_kernel(dev, "kernel/sortP_prescan.cl", "sortP_prescan_chunked", KERNEL_BUILD_MACRO, ICL_SOURCE); icl_kernel* postScan = icl_create_kernel(dev, "kernel/sortP_postscan.cl", "sortP_postscan_chunked", KERNEL_BUILD_MACRO, ICL_SOURCE); segmented_scan_init(nParticles, dev, KERNEL_BUILD_MACRO, ICL_SOURCE); icl_kernel* memset_int_s = icl_create_kernel(dev, "kernel/init.cl", "memset_int_s", KERNEL_BUILD_MACRO, ICL_SOURCE); // approach with segmented scan icl_buffer *scan_data = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(int) * nParticles); icl_buffer *scan_flag = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(int) * nParticles); icl_buffer* buffered_particles = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(struct Particle) * nParticles); #if timing == 1 icl_timer* timer_gp2c = icl_init_timer(ICL_MILLI); icl_timer* timer_cBBox = icl_init_timer(ICL_MILLI); icl_timer* timer_bBox = icl_init_timer(ICL_MILLI); icl_timer* timer_sln = icl_init_timer(ICL_MILLI); icl_timer* timer_sortP 
= icl_init_timer(ICL_MILLI); icl_timer* timer_snf = icl_init_timer(ICL_MILLI); icl_timer* timer_pnl = icl_init_timer(ICL_MILLI); icl_timer* timer_ran = icl_init_timer(ICL_MILLI); icl_timer* timer_prescan = icl_init_timer(ICL_MILLI); icl_timer* timer_scan = icl_init_timer(ICL_MILLI); icl_timer* timer_postscan = icl_init_timer(ICL_MILLI); #endif //add root node to the activelist and initialize size lists icl_run_kernel(init, 1, &globalSize, &localSize, NULL, NULL, 3, (size_t)0, (void *)activelist, (size_t)0, (void *)sizes, (size_t)0, (void *)particlesD); UINT activeN = 1; icl_finish(dev); // smallest power of 2 bigger or equal to maxxNchnunks UINT pow2maxNchunks = pow2roundup(maxNchunks); // processLargeNode while(activeN != 0) { icl_start_timer(timer); // group triangles into chunks size_t localSize1 = min(pow2maxNchunks, 256); #if timing == 1 icl_start_timer(timer_gp2c); #endif size_t globalSize1 = ((maxNchunks + localSize1 -1) / localSize1) * localSize1; // reset chunks icl_run_kernel(resetChunks, 1, &globalSize1, &localSize1, NULL, NULL, 2, (size_t)0, (void *)chunks, sizeof(UINT), &maxNchunks); globalSize1 = localSize1 * activeN; // split every node in chunk of chunk_size icl_run_kernel(gp2c, 1, &globalSize1, &localSize1, NULL, NULL, 4, (size_t)0, (void *)nodelist, (size_t)0, (void *)activelist, (size_t)0, (void *)chunks, sizeof(UINT), &activeN); #if timing == 1 icl_finish(dev); icl_stop_timer(timer_gp2c); #endif // compute per chunk bounding box size_t localSize2 = chunk_size; size_t globalSize2 = maxNchunks * chunk_size; #if timing == 1 icl_start_timer(timer_cBBox); #endif icl_run_kernel(cBBox, 1, &globalSize2, &localSize2, NULL, NULL, 5, (size_t)0, (void *)nodelist, (size_t)0, (void *)activelist, (size_t)0, (void *)particlesD, (size_t)0, (void *)chunks, (size_t)0, (void *)bboxes); #if timing == 1 icl_finish(dev); icl_stop_timer(timer_cBBox); #endif // compute per node bounding box size_t localSize3 = min(pow2maxNchunks, 256); size_t globalSize3 = localSize3 * 
activeN; #if timing == 1 icl_start_timer(timer_bBox); #endif icl_run_kernel(bBox, 1, &globalSize3, &localSize3, NULL, NULL, 4, (size_t)0, (void *)nodelist, (size_t)0, (void *)activelist, (size_t)0, (void *)bboxes, sizeof(UINT), &activeN); #if timing == 1 icl_finish(dev); icl_stop_timer(timer_bBox); #endif // split large nodes size_t localSize4 = 256; size_t globalSize4 = ((activeN + 255) / 256) * 256; #if timing == 1 icl_start_timer(timer_sln); #endif icl_run_kernel(sln, 1, &globalSize4, &localSize4, NULL, NULL, 5, (size_t)0, (void *)nodelist, (size_t)0, (void *)activelist, (size_t)0, (void *)nextlist, (size_t)0, (void *)sizes, sizeof(UINT), &activeN); #if timing == 1 icl_finish(dev); icl_stop_timer(timer_sln); #endif /////////////////////////////////////////////////////////////////////////////// // XXx replaced with segmented scan //globalSize = (activeN+1) * 256; #if timing == 1 icl_start_timer(timer_sortP); #endif #if DEVICE == ICL_CPU // sort particles to child nodes size_t localSize5 = 256; size_t globalSize5 = ((activeN + 255) / 256) * 256; icl_run_kernel(sortP, 1, &globalSize5, &localSize5, NULL, NULL, 5, (size_t)0, (void *)nodelist, (size_t)0, (void *)particlesD, (size_t)0, (void *)activelist, (size_t)0, (void *)nextlist, sizeof(UINT), &activeN ); #else // init scan_flag to 1 cl_int initFlag = 1; size_t np = (size_t)((nParticles + localSize4 -1 ) / localSize4) * localSize4; icl_run_kernel(memset_int_s, 1, &np, &localSize4, NULL, NULL, 3, (size_t)0, (void *)scan_flag, sizeof(cl_int), &initFlag, sizeof(UINT), &nParticles ); #if timing == 1 icl_start_timer(timer_prescan); #endif // pre-scan fills data0 and data1 with 1 and 0 whenever value < pivot localSize = chunk_size; // globalSize = activeN * 256; globalSize = maxNchunks * chunk_size; icl_run_kernel(preScan, 1, &globalSize, &localSize, NULL, NULL, 7, (size_t)0, (void *)nodelist, (size_t)0, (void *)chunks, (size_t)0, (void *)particlesD, (size_t)0, (void *)activelist, sizeof(UINT), &activeN, (size_t)0, (void 
*)scan_data, (size_t)0, (void *)scan_flag ); #if timing == 1 icl_finish(dev); icl_stop_timer(timer_prescan); #endif #if timing == 1 icl_start_timer(timer_scan); #endif // scan for scan(scan_data, scan_flag, nParticles); #if timing == 1 icl_finish(dev); icl_stop_timer(timer_scan); #endif // copy partially sorted data to the final icl_copy_buffer(particlesD, buffered_particles, sizeof(struct Particle) * nParticles, NULL, NULL); // swap(particlesD, buffered_particles); #if timing == 1 icl_start_timer(timer_postscan); #endif localSize = chunk_size; globalSize = maxNchunks * chunk_size; icl_run_kernel(postScan, 1, &globalSize, &localSize, NULL, NULL, 7, (size_t)0, (void *)nodelist, (size_t)0, (void *)chunks, (size_t)0, (void *)particlesD, (size_t)0, (void *)activelist, sizeof(UINT), &activeN, (size_t)0, (void *)buffered_particles, (size_t)0, (void *)scan_data ); #if timing == 1 icl_finish(dev); icl_stop_timer(timer_postscan); #endif icl_finish(dev); #endif /////////////////////////////////////////////////////////////////////////////// #if timing == 1 icl_finish(dev); icl_stop_timer(timer_sortP); #endif // small node filtering size_t localSize6 = 256; size_t globalSize6 = ((activeN*2 + 255) / 256) * 256; #if timing == 1 icl_start_timer(timer_snf); #endif icl_run_kernel(snf, 1, &globalSize6, &localSize6, NULL, NULL, 4, (size_t)0, (void *)nodelist, (size_t)0, (void *)nextlist, (size_t)0, (void *)smalllist, (size_t)0, (void *)sizes //, sizeof(UINT), &nParticles ); #if timing == 1 icl_finish(dev); icl_stop_timer(timer_snf); #endif // packing of nextlist size_t localSize7 = 1; size_t globalSize7 = 1; #if timing == 1 icl_start_timer(timer_pnl); #endif icl_run_kernel(pnl, 1, &globalSize7, &localSize7, NULL, NULL, 2, (size_t)0, (void *)nextlist, (size_t)0, (void *)sizes ); #if timing == 1 icl_finish(dev); icl_stop_timer(timer_pnl); #endif // swap nextlist and activelist swap(&nextlist, &activelist); #if timing == 1 icl_start_timer(timer_ran); #endif // read size of next 
activelist set in kernel icl_read_buffer(sizes, CL_TRUE, sizeof(UINT), &activeN, NULL, NULL); icl_finish(dev); #if timing == 1 icl_stop_timer(timer_ran); #endif ++level; //printf("%d: ActiveN %d\n", level, activeN); time = icl_stop_timer(timer); } icl_release_kernel(init); icl_release_kernel(gp2c); icl_release_kernel(cBBox); icl_release_kernel(bBox); icl_release_kernel(sln); icl_release_kernel(sortP); icl_release_kernel(snf); icl_release_kernel(pnl); ////////////////////////////////////////////////////////////////////////// icl_release_kernel(preScan); icl_release_kernel(postScan); segmented_scan_release(); icl_release_buffers(3, scan_data, scan_flag, buffered_particles); #if timing == 1 printf("gp2c %f\ncBBox %f\nbBox %f\nsln %f\nsortP %f\nsnf %f\npnl %f\nran %f\n\n", timer_gp2c->current_time, timer_cBBox->current_time, timer_bBox->current_time, timer_sln->current_time, timer_sortP->current_time, timer_snf->current_time, timer_pnl->current_time, timer_ran->current_time); icl_release_timer(timer_gp2c); icl_release_timer(timer_cBBox); icl_release_timer(timer_bBox); icl_release_timer(timer_sln); icl_release_timer(timer_sortP); icl_release_timer(timer_snf); icl_release_timer(timer_pnl); printf("prescan %f\nscan %f\npostscan %f\n\n", timer_prescan->current_time, timer_scan->current_time, timer_postscan->current_time); icl_release_timer(timer_prescan); icl_release_timer(timer_scan); icl_release_timer(timer_postscan); #endif /* icl_read_buffer(nodelist, CL_TRUE, sizeof(struct Node) * 6000, tree->nodelist, NULL, NULL); printf("node: %d, left %d, right %d", tree->nodelist[0].particlesHigh - tree->nodelist[0].particlesLow, tree->nodelist[1].particlesHigh - tree->nodelist[1].particlesLow, tree->nodelist[2].particlesHigh - tree->nodelist[2].particlesLow); printBox(tree->nodelist[49].bounding_box); printBox(tree->nodelist[53].bounding_box); printBox(tree->nodelist[54].bounding_box); for(int i = 0; i < 6000; ++i) if(tree->nodelist[i].bounding_box.box[0].x != 0.0) 
printBox(tree->nodelist[i].bounding_box); */ //small nodes stage // preprocessSmallNodes(smalllist); icl_release_buffers(3, activelist, chunks, bboxes); icl_kernel* sasl = icl_create_kernel(dev, "kernel/swapActiveAndSmalllist.cl", "swapActiveAndSmalllist", KERNEL_BUILD_MACRO, ICL_SOURCE); icl_kernel* ssn = icl_create_kernel(dev, "kernel/splitSmallNodes.cl", "splitSmallNodes", KERNEL_BUILD_MACRO, ICL_SOURCE); #if timing == 1 icl_timer* timer_ssn = icl_init_timer(ICL_MILLI); icl_timer* timer_sasl = icl_init_timer(ICL_MILLI); icl_timer* timer_rsn = icl_init_timer(ICL_MILLI); #endif size_t localSize8 = 1; size_t globalSize8 = 1; UINT setMaxLevel = 0; icl_run_kernel(sasl, 1, &globalSize8, &localSize8, NULL, NULL, 2, (size_t)0, (void *)sizes, sizeof(UINT), &level); // get number of small nodes icl_read_buffer(sizes, CL_TRUE, sizeof(UINT), &activeN, NULL, NULL); while(activeN != 0) { icl_start_timer(timer); // compute SVH and determine the split plane size_t localSize9 = 256; size_t globalSize9 = ((activeN + 255) / 256) * 256; #if timing == 1 icl_start_timer(timer_ssn); #endif icl_run_kernel(ssn, 1, &globalSize9, &localSize9, NULL, NULL, 5, (size_t)0, (void *)nodelist, (size_t)0, (void *)smalllist, (size_t)0, (void *)nextlist, (size_t)0, (void *)particlesD, (size_t)0, (void *)sizes); #if timing == 1 icl_finish(dev); icl_stop_timer(timer_ssn); #endif size_t localSizeA = 1; size_t globalSizeA = 1; #if timing == 1 icl_start_timer(timer_sasl); #endif icl_run_kernel(sasl, 1, &globalSizeA, &localSizeA, NULL, NULL, 2, (size_t)0, (void *)sizes, sizeof(UINT), &setMaxLevel); #if timing == 1 icl_finish(dev); icl_stop_timer(timer_sasl); #endif swap(&nextlist, &smalllist); // read size of next activelist set in kernel #if timing == 1 icl_start_timer(timer_rsn); #endif icl_read_buffer(sizes, CL_TRUE, sizeof(UINT), &activeN, NULL, NULL); //printf("small size %d\n", activeN); icl_finish(dev); #if timing == 1 icl_stop_timer(timer_rsn); #endif time = icl_stop_timer(timer); } 
icl_release_buffer(smalllist); icl_release_buffer(nextlist); icl_release_kernel(sasl); icl_release_kernel(ssn); #if timing == 1 printf("ssn %f\nsasl %f\nrsn %f\n\n", timer_ssn->current_time, timer_sasl->current_time, timer_rsn->current_time); icl_release_timer(timer_ssn); icl_release_timer(timer_sasl); icl_release_timer(timer_rsn); #endif UINT s[5]; icl_read_buffer(sizes, CL_TRUE, sizeof(UINT) * 5, &s, NULL, NULL); icl_release_buffer(sizes); icl_kernel* upPass = icl_create_kernel(dev, "kernel/upPass.cl", "upPass", KERNEL_BUILD_MACRO, ICL_SOURCE); icl_kernel* downPass = icl_create_kernel(dev, "kernel/kdDownPass.cl", "downPass", KERNEL_BUILD_MACRO, ICL_SOURCE); #if timing == 1 icl_timer* timer_upPass = icl_init_timer(ICL_MILLI); icl_timer* timer_downPass = icl_init_timer(ICL_MILLI); icl_timer* timer_rt = icl_init_timer(ICL_MILLI); #endif UINT treeHeight = s[4]; printf("Tree height: %d\n", treeHeight); size_t localSizeB = 256; size_t globalSizeB = ((nNodes + 255) / 256) * 256; icl_start_timer(timer); #if timing == 1 icl_start_timer(timer_upPass); #endif for(int l = (int)treeHeight; l >= 0; --l) { icl_run_kernel(upPass, 1, &globalSizeB, &localSizeB, NULL, NULL, 4, (size_t)0, (void *)nodelist, (size_t)0, (void *)particlesD, sizeof(int), &l, sizeof(UINT), &nNodes); } #if timing == 1 icl_finish(dev); icl_stop_timer(timer_upPass); icl_start_timer(timer_downPass); #endif for(UINT l = 0; l <= treeHeight; ++l) { icl_run_kernel(downPass, 1, &globalSizeB, &localSizeB, NULL, NULL, 4, (size_t)0, (void *)nodelist, (size_t)0, (void *)treeD, sizeof(UINT), &l, sizeof(UINT), &nNodes); } icl_finish(dev); #if timing == 1 icl_stop_timer(timer_downPass); #endif time = icl_stop_timer(timer); icl_release_kernel(upPass); icl_release_kernel(downPass); #if timing == 1 icl_start_timer(timer_rt); #endif #if timing == 1 icl_finish(dev); icl_stop_timer(timer_rt); printf("upPass %f\ndownPass %f\nread Tree %f\n\n", timer_upPass->current_time, timer_downPass->current_time, timer_rt->current_time); 
icl_release_timer(timer_upPass); icl_release_timer(timer_downPass); icl_release_timer(timer_rt); #endif // struct Node* kdTree = (struct Node*)malloc(sizeof(struct Node) * nNodes); // icl_read_buffer(treeD, CL_TRUE, sizeof(struct Node) * nNodes, kdTree, NULL, NULL); // printf("%d", tree->nodelist[0].left_child); printf("\nTime: %f\n", time); icl_release_timer(timer); return treeHeight; }
int main (int argc, char **argv) { struct BBox box; struct Particle *particles; particle_data *P; io_header header; int tot = snapshotLoader(argv[1], &header, &P); int k = 0; if(tot <= 0) { printf("error while loading snapshot file\n"); return -1; } initBBox2(&box); particles = (struct Particle*)malloc(header.npartTotal[1] * sizeof(struct Particle)); UINT* particleIds = (UINT*)malloc(header.npartTotal[1] * sizeof(UINT)); //for(int j = header.npartTotal[0]; j < header.npartTotal[0]+header.npartTotal[1]; ++j) #define F 1 for(int j = header.npartTotal[0]; j < header.npartTotal[0]+header.npartTotal[1]; j += F) { particles[k].pos.x = P[j].Pos[0]; particles[k].pos.y = P[j].Pos[1]; particles[k].pos.z = P[j].Pos[2]; particles[k].vel.x = P[j].Vel[0]; particles[k].vel.y = P[j].Vel[1]; particles[k].vel.z = P[j].Vel[2]; particles[k].mass = P[j].Mass; particles[k].id = P[j].Id; particleIds[k] = P[j].Id; //printf("%f %f %f\n", particles[k].pos.x, particles[k].pos.y, particles[k].pos.z); //get bbox /* if(particles[k].pos.x < box.box[0].x) box.box[0].x = particles[k].pos.x; if(particles[k].pos.y < box.box[0].y) box.box[0].y = particles[k].pos.y; if(particles[k].pos.z < box.box[0].z) box.box[0].z = particles[k].pos.z; if(particles[k].pos.x >= box.box[1].x) box.box[1].x = particles[k].pos.x; if(particles[k].pos.y >= box.box[1].y) box.box[1].y = particles[k].pos.y; if(particles[k].pos.z >= box.box[1].z) box.box[1].z = particles[k].pos.z; */ ++k; } free(P); header.npartTotal[1] /= F; struct Tree tree; tree.nodelist = (struct Node*)malloc(2*header.npartTotal[1] * sizeof(struct Node)); struct Particle* ref = (struct Particle*)malloc(sizeof(struct Particle) * header.npartTotal[1]); // init ocl icl_init_devices(DEVICE); if (icl_get_num_devices() != 0) { icl_device* dev = icl_get_device(0); icl_print_device_short_info(dev); icl_buffer* particlesD = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(struct Particle) * header.npartTotal[1]); // copy particles to ocl device 
icl_write_buffer(particlesD, CL_TRUE, sizeof(struct Particle) * header.npartTotal[1], &particles[0], NULL, NULL); run(1, 0.00001, 0.0025, particles, particlesD, header.npartTotal[1], dev, &tree, ref); // icl_buffer* kdTree = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(struct KdNode) * (header.npartTotal[1] * 2 - 1)); // buildTree(tree.nodelist, particlesD, kdTree, header.npartTotal[1], dev); // TODO particles have been resorted during tree construction. Upload them in original sorting for comparison, not needed for correctness REMOVE IT! // icl_write_buffer(particlesD, CL_TRUE, sizeof(struct Particle) * header.npartTotal[1], &particles[0], NULL, NULL); // walk(kdTree, particlesD, header.npartTotal[1], 0.00001, dev); // // read particles from device, used as reference for correctness check // icl_read_buffer(particlesD, CL_TRUE, sizeof(struct Particle) * header.npartTotal[1], &ref[0], NULL, NULL); // // printf("Walk second time with last acceleration of particles\n"); // // walk(kdTree, particlesD, header.npartTotal[1], 0.00001, dev, particles); // // // read particles from device icl_read_buffer(particlesD, CL_TRUE, sizeof(struct Particle) * header.npartTotal[1], &particles[0], NULL, NULL); // // icl_release_buffer(kdTree); icl_release_buffer(particlesD); icl_release_devices(); printf("\nSUCCESS\n"); } else { printf("ERROR! Cannot find requested device\n"); return -1; } // check_force("forcetest_1e5.txt", "result.txt", particles, particleIds, header.npartTotal[1]); // check_force("forcetest.txt", "result.txt", particles, particleIds, header.npartTotal[1]); #if timing == 1 check_force_internal("result.txt", ref, particles, particleIds, header.npartTotal[1]); #endif // display interactions, stored in each particle at acc.x /*FLOAT average = 0; for(UINT i = 0; i < header.npartTotal[1]; ++i) { average += particles[i].acc.x; } printf("Average number of interactions: %f\n", average/header.npartTotal[1]); */ free(tree.nodelist); free(particles); free(ref); return 0; }
/************************************************************************** Function: ocl_jacobi This routine contains the main iteration loop for the Jacobi iteration using OpenCL kernel. params: a two arrays to compute solution into max_iter maximum number of iterations size size of array for this MPI rank tolerance all differences should be les than this tolerance value mpi_ranks number of MPI ranks in each dimension rank_pos cartesian position of this rank origin origin for this rank d discretion size mpi_comm MPI communications structure local_workblock_size size of local workblock for OpenCL kernel device_type OpenCL device type full_copy boolean if full buffer copy is to be done **************************************************************************/ static void ocl_jacobi(value_type *a[2], unsigned int max_iter, size_t size[DIMENSIONS], value_type tolerance, value_type d[DIMENSIONS], size_t local_workblock_size[DIMENSIONS], cl_device_type device_type, unsigned int full_copy) { size_t array_size; unsigned int i, j, rc, iter = 0; size_t delta_buffer_size, delta_size[DIMENSIONS]; size_t tile_delta_size, tile_cache_size; value_type max_diff, timer; icl_device* device_id; icl_kernel* kernel; cl_int err; icl_buffer *a_buf[2], *delta_buf; value_type *delta; /* convenience for y stride in array */ cl_uint ystride = size[Y]+2*GHOST_CELL_WIDTH; /* init devices */ icl_init_devices(device_type); /* find OpenCL device */ device_id = icl_get_device(0); /* build the kernel and verify the kernel */ kernel = icl_create_kernel(device_id, "jacsolver_kernel.cl", "ocl_jacobi_local_copy", "", ICL_SOURCE); /* calculate size of kernel local memory - also used later for kernel params */ tile_delta_size = local_workblock_size[X] * local_workblock_size[Y]; tile_cache_size = (local_workblock_size[X]+2*GHOST_CELL_WIDTH) * (local_workblock_size[Y]+2*GHOST_CELL_WIDTH); /* verify the device has enough resources for this device */ /* I'm an optimist, we just hope for the best if 
((cluGetAvailableLocalMem(device_id, kernel) < tile_delta_size + tile_cache_size) || (! cluCheckLocalWorkgroupSize(device_id, kernel, DIMENSIONS, local_workblock_size))) { local_workblock_size[X] = 1; local_workblock_size[Y] = 1; } */ printf("Estimating solution using OpenCL Jacobi iteration with %d x %d workblock.\n", (int)local_workblock_size[X], (int)local_workblock_size[Y]); fflush(stdout); /* init arrays by setting the initial value and the boundary conditions */ set_initial_solution(a[OLD], size, INITIAL_GUESS); set_initial_solution(a[NEW], size, INITIAL_GUESS); set_boundary_conditions(a[OLD], size, d); set_boundary_conditions(a[NEW], size, d); /* print the initial solution guess */ print_array("Init ", a[NEW], size, d); /* allocate memory for differences */ delta_size[X] = size[X] / local_workblock_size[X]; delta_size[Y] = size[Y] / local_workblock_size[Y]; delta_buffer_size = delta_size[X] * delta_size[Y]; delta = (value_type *)malloc(sizeof(value_type) * delta_buffer_size); /* initialize deltas so that first execution of kernel with overlapping * reduction on the host will work correctly and not prematurely exit */ for (i=0; i<delta_size[X]; ++i) { for (j=0; j<delta_size[Y]; ++j) { delta[i * delta_size[Y] + j] = 1.0; } } /* create buffers for OpenCL device using host memory */ array_size = (size[X]+2*GHOST_CELL_WIDTH) * ystride; a_buf[OLD] = icl_create_buffer(device_id, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(value_type) * array_size); a_buf[NEW] = icl_create_buffer(device_id, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(value_type) * array_size); delta_buf = icl_create_buffer(device_id, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(value_type) * delta_buffer_size); /* copy over buffers to device */ icl_write_buffer(a_buf[OLD], CL_TRUE, sizeof(value_type) * array_size, a[OLD], NULL, NULL); icl_write_buffer(a_buf[NEW], CL_TRUE, sizeof(value_type) * array_size, a[NEW], NULL, NULL); /* set the kernel execution type - data parallel */ // 
cluSetKernelNDRange(clu, kernel, DIMENSIONS, NULL, size, local_workblock_size); /* iterate until maximum difference is less than the given tolerance or number of iterations is too high */ do { /* swap array pointers for next iteration */ SWAP_PTR(a[OLD], a[NEW]); SWAP_BUF(a_buf[OLD], a_buf[NEW]); icl_run_kernel(kernel, DIMENSIONS, size, local_workblock_size, NULL, NULL, 6, (size_t)0,(void *) a_buf[OLD], (size_t)0, (void *) a_buf[NEW], sizeof(value_type) * tile_delta_size, NULL, sizeof(value_type) * tile_cache_size, NULL, (size_t)0, (void *) delta_buf, sizeof(cl_uint), (void *) &ystride); /* while the kernel is running, calculate the reduction for the previous iteration */ max_diff = ocl_jacobi_reduce(delta, delta_size); /* enqueue a synchronous copy of the delta. This will not occur until the kernel * has finished. The deltas for each workgroup is a much smaller array to process */ icl_read_buffer(delta_buf, CL_TRUE, sizeof(value_type) * delta_buffer_size, delta, NULL, NULL); // clEnqueueReadBuffer(queue, a_buf[NEW], CL_TRUE, 0, sizeof(value_type) * array_size, a[NEW], 0, NULL, NULL)); /* output status for user, overwrite the same line */ if ((0 == iter % 100)) { printf("Iteration=%5d, max difference=%0.7f, target=%0.7f\r", iter, max_diff, tolerance); fflush(stdout); } /* increment the iteration counter */ iter++; } while (max_diff > tolerance && max_iter >= iter); /* do loop */ /* read back the final result */ icl_read_buffer(a_buf[NEW], CL_TRUE, sizeof(value_type) * array_size, a[NEW], NULL, NULL); /* output final iteration count and maximum difference value */ printf("Iteration=%5d, max difference=%0.7f, execution time=%.3f seconds\n", iter-1, max_diff, timer); fflush(stdout); /* finish usage of OpenCL device */ icl_release_buffers(3, a_buf[OLD], a_buf[NEW], delta_buf); icl_release_kernel(kernel); free(delta); }
int main(int argc, const char* argv[]) { icl_args* args = icl_init_args(); icl_parse_args(argc, argv, args); icl_print_args(args); chdir(PATH); // int dim = 128; int size = args->size; int nRef = 100000; float* ref = (float*)malloc(sizeof(float) * nRef /* dim*/); float* query = (float*)malloc(sizeof(float) * size /* dim*/); float* dists = (float *)malloc(sizeof(float) * size); int* neighbors = (int*)malloc(sizeof(int) * size); srand(42); for(int i=0; i < nRef/*dim*/; ++i) { ref[i] = rand(); } for(int i=0; i < size/**dim*/; ++i) { query[i] = rand(); } icl_init_devices(ICL_ALL); icl_start_energy_measurement(); if (icl_get_num_devices() != 0) { icl_device* dev = icl_get_device(0); icl_print_device_short_info(dev); icl_kernel* kernel = icl_create_kernel(dev, "knn.cl", "knn", "", ICL_SOURCE); size_t szLocalWorkSize = args->local_size; float multiplier = size/(float)szLocalWorkSize; if(multiplier > (int)multiplier) multiplier += 1; size_t szGlobalWorkSize = (int)multiplier * szLocalWorkSize; for (int i = 0; i < args->loop_iteration; ++i) { icl_buffer* buf_ref = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(float) * nRef /* dim*/); icl_buffer* buf_query = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(float) * size /* dim*/); icl_buffer* buf_dists = icl_create_buffer(dev, CL_MEM_WRITE_ONLY, sizeof(float) * size); icl_buffer* buf_neighbors = icl_create_buffer(dev, CL_MEM_WRITE_ONLY, sizeof(int) * size); icl_write_buffer(buf_ref, CL_TRUE, sizeof(float) * nRef /*dim*/, &ref[0], NULL, NULL); icl_write_buffer(buf_query, CL_TRUE, sizeof(float) * size /*dim*/, &query[0], NULL, NULL); icl_run_kernel(kernel, 1, &szGlobalWorkSize, &szLocalWorkSize, NULL, NULL, 6, (size_t)0, (void *)buf_ref, (size_t)0, (void *)buf_query, (size_t)0, (void *)buf_dists, (size_t)0, (void *)buf_neighbors, sizeof(cl_int), (void *)&nRef, sizeof(cl_int), (void *)&size); // sizeof(cl_int), (void *)&dim); icl_read_buffer(buf_dists, CL_TRUE, sizeof(float) * size, &dists[0], NULL, NULL); 
icl_read_buffer(buf_neighbors, CL_TRUE, sizeof(int) * size, &neighbors[0], NULL, NULL); icl_release_buffers(4, buf_ref, buf_query, buf_dists, buf_neighbors); } icl_release_kernel(kernel); } icl_stop_energy_measurement(); if (args->check_result) { printf("======================\n= KNN program working\n"); unsigned int check = 1; unsigned int sum = 0; for(int i = 0; i < size; ++i) { if(dists[i] < 0) check = 0; if(neighbors[i] < 0 || neighbors[i] >= nRef) check = 0; } printf("======================\n"); printf("Result check: %s\n", check ? "OK" : "FAIL"); } else { printf("Result check: OK\n"); } icl_release_args(args); icl_release_devices(); free(ref); free(query); free(dists); free(neighbors); }
int main(int argc, const char* argv[]) { chdir(PATH); icl_args* args = icl_init_args(); icl_parse_args(argc, argv, args); icl_print_args(args); int size = args->size; // this is the size of chunking - so far as big as the local size int chunkSize = 16; cl_float16* input = (cl_float16*)malloc(sizeof(cl_float16) * size); float* output = (float*)malloc(sizeof(float) * size); fillrandom_float((float*)input,size, chunkSize, 0.001f ,100000.f); icl_init_devices(args->device_type); icl_start_energy_measurement(); if (icl_get_num_devices() != 0) { icl_device* dev = icl_get_device(args->device_id); icl_print_device_short_info(dev); icl_kernel* kernel = icl_create_kernel(dev, "reduction_chunking.cl", "reduce", "", ICL_SOURCE); size_t szLocalWorkSize = args->local_size; float multiplier = size/(float)szLocalWorkSize; if(multiplier > (int)multiplier) multiplier += 1; size_t szGlobalWorkSize = (int)multiplier * szLocalWorkSize; for (int i = 0; i < args->loop_iteration; ++i) { icl_buffer* buf_input = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(cl_float16) * size); icl_buffer* buf_output = icl_create_buffer(dev, CL_MEM_WRITE_ONLY, sizeof(float) * size); icl_write_buffer(buf_input, CL_FALSE, sizeof(cl_float16) * size, &input[0], NULL, NULL); icl_run_kernel(kernel, 1, &szGlobalWorkSize, &szLocalWorkSize, NULL, NULL, 4, (size_t)0, (void *)buf_input, (size_t)0, (void *)buf_output, sizeof(cl_int), (void *)&chunkSize, sizeof(cl_int), (void *)&size ); icl_read_buffer(buf_output, CL_TRUE, sizeof(float) * size, &output[0], NULL, NULL); icl_release_buffers(2, buf_input, buf_output); } icl_release_kernel(kernel); } icl_stop_energy_measurement(); // printf("Chunks' minimum \n"); // out_float_hbuffer(output, size); if (args->check_result) { printf("======================\n= Reduction test\n"); unsigned int check = 1; float host_min = 100000.f; float* testInput = (float*)input; for(unsigned int i = 0; i < size*chunkSize; ++i) if(testInput[i] < host_min) host_min = testInput[i]; printf("Host 
minimum is %f\n", host_min); float device_min = 100000.f; for(unsigned int i = 0; i < size; ++i) if(output[i] < device_min) device_min = output[i]; printf("Device minimum is %f\n", device_min); printf("Result check: %s\n", (device_min == host_min) ? "OK" : "FAIL"); } else { printf("Result check: OK\n"); } icl_release_devices(); free(input); free(output); #ifdef _MSC_VER icl_prompt(); #endif return 0; }
int main(int argc, const char* argv[]) { icl_args* args = icl_init_args(); icl_parse_args(argc, argv, args); icl_print_args(args); chdir(PATH); int size = args->size; float* input1 = (float*) malloc(sizeof(float) * size); float* input2 = (float*) malloc(sizeof(float) * size); float* alpha = (float*) malloc(sizeof(float) * size); float* beta = (float*) malloc(sizeof(float) * size); float* output = (float*) malloc(sizeof(float) * size); fill_random_float(input2, size, 1, -1.0f, 1.0f); qsort(input2, size, sizeof(float), float_compare); float step = 2.0f / size; for(int i=0; i < size; i++) input1[i] = -1.0f + i * step; fill_random_float(alpha, size, 1, -1.0f, 1.0f); fill_random_float(beta, size, 1, -1.0f, 1.0f); icl_init_devices(args->device_type); icl_start_energy_measurement(); if (icl_get_num_devices() != 0) { icl_device* dev = icl_get_device(args->device_id); icl_print_device_short_info(dev); icl_kernel* kernel = icl_create_kernel(dev, "lin_reg.cl", "lin_reg", "", ICL_SOURCE); size_t szLocalWorkSize = args->local_size; float multiplier = size/(float)szLocalWorkSize; if(multiplier > (int)multiplier) multiplier += 1; size_t szGlobalWorkSize = (int)multiplier * szLocalWorkSize; for (int i = 0; i < args->loop_iteration; ++i) { icl_buffer* buf_input1 = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(float) * size); icl_buffer* buf_input2 = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(float) * size); icl_buffer* buf_alpha = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(float) * size); icl_buffer* buf_beta = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(float) * size); icl_buffer* buf_output = icl_create_buffer(dev, CL_MEM_WRITE_ONLY, sizeof(float) * size); icl_write_buffer(buf_input1, CL_TRUE, sizeof(float) * size, &input1[0], NULL, NULL); icl_write_buffer(buf_input2, CL_TRUE, sizeof(float) * size, &input2[0], NULL, NULL); icl_write_buffer(buf_alpha, CL_TRUE, sizeof(float) * size, &alpha[0], NULL, NULL); icl_write_buffer(buf_beta, CL_TRUE, sizeof(float) * size, 
&beta[0], NULL, NULL); icl_run_kernel(kernel, 1, &szGlobalWorkSize, &szLocalWorkSize, NULL, NULL, 6, (size_t)0, (void *)buf_input1, (size_t)0, (void *)buf_input2, (size_t)0, (void *)buf_alpha, (size_t)0, (void *)buf_beta, (size_t)0, (void *)buf_output, sizeof(cl_int), (void *)&size); icl_read_buffer(buf_output, CL_TRUE, sizeof(float) * size, &output[0], NULL, NULL); icl_release_buffers(5, buf_input1, buf_input2, buf_alpha, buf_beta, buf_output); } icl_release_kernel(kernel); } icl_stop_energy_measurement(); if (args->check_result) { printf("======================\n= Linear Regression Done\n"); float* output2 = (float *)malloc(sizeof(float) * size); for(unsigned int j = 0; j < size; ++j) { const int gid = j; float a = alpha[gid]; float b = beta[gid]; float error = 0; for(int i=0; i<size; i++) { float e = (a * input1[i] + b) - input2[i]; error += e * e; } output2[gid] = error; } bool check = compare_float(output, output2, size, 0.000001); printf("======================\n"); printf("Result check: %s\n", check ? "OK" : "FAIL"); free(output2); } else { printf("Result check: OK\n"); } icl_release_args(args); icl_release_devices(); free(input1); free(input2); free(alpha); free(beta); free(output); }
int main(int argc, const char* argv[]) { icl_args* args = icl_init_args(); icl_parse_args(argc, argv, args); icl_print_args(args); chdir(PATH); int size = args->size; int* input = (int*)malloc(sizeof(int) * size); int* output = (int *)malloc(sizeof(int) * size); for(int i=0; i < size; ++i) { input[i] = i; } icl_init_devices(ICL_ALL); icl_start_energy_measurement(); if (icl_get_num_devices() != 0) { icl_device* dev = icl_get_device(0); icl_print_device_short_info(dev); icl_kernel* kernel = icl_create_kernel(dev, "simple.cl", "simple", "", ICL_SOURCE); size_t szLocalWorkSize = args->local_size; float multiplier = size/(float)szLocalWorkSize; if(multiplier > (int)multiplier) multiplier += 1; size_t szGlobalWorkSize = (int)multiplier * szLocalWorkSize; for (int i = 0; i < args->loop_iteration; ++i) { icl_buffer* buf_input = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(int) * size); icl_buffer* buf_output = icl_create_buffer(dev, CL_MEM_WRITE_ONLY, sizeof(int) * size); icl_write_buffer(buf_input, CL_TRUE, sizeof(int) * size, &input[0], NULL, NULL); icl_run_kernel(kernel, 1, &szGlobalWorkSize, &szLocalWorkSize, NULL, NULL, 3, (size_t)0, (void *)buf_input, (size_t)0, (void *)buf_output, sizeof(cl_int), (void *)&size); icl_read_buffer(buf_output, CL_TRUE, sizeof(int) * size, &output[0], NULL, NULL); icl_release_buffers(2, buf_input, buf_output); } icl_release_kernel(kernel); } icl_stop_energy_measurement(); if (args->check_result) { printf("======================\n= Simple program working\n"); unsigned int check = 1; for(unsigned int i = 0; i < size; ++i) { if(output[i] != input[i]) { check = 0; printf("= fail at %d, expected %d / actual %d", i, i, output[i]); break; } } printf("======================\n"); printf("Result check: %s\n", check ? "OK" : "FAIL"); } else { printf("Result check: OK\n"); } icl_release_args(args); icl_release_devices(); free(input); free(output); }
int main(int argc, char* argv[]) { int size = 1000; int* input1 = (int*)malloc(sizeof(int) * size); int* input2 = (int*) malloc(sizeof(int) * size); int* output = (int *)malloc(sizeof(int) * size); for(int i=0; i < size; ++i) { input1[i] = i; input2[i] = 1; } #ifndef INSIEME icl_timer* time1 = icl_init_timer(ICL_SEC); icl_start_timer(time1); #endif icl_init_devices(ICL_CPU); #ifndef INSIEME printf("TIME for initialization: %f\n", icl_stop_timer(time1)); #endif if (icl_get_num_devices() != 0) { icl_device* dev = icl_get_device(0); icl_print_device_short_info(dev); icl_kernel* kernel = icl_create_kernel(dev, "vec_mul.cl", "vec_mul", "", ICL_SOURCE); icl_buffer* buf_input1 = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(int) * size); icl_buffer* buf_input2 = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(int) * size); icl_buffer* buf_output = icl_create_buffer(dev, CL_MEM_WRITE_ONLY, sizeof(int) * size); icl_event* wb1 = icl_create_event(); icl_event* wb2 = icl_create_event(); icl_event* rb = icl_create_event(); icl_write_buffer(buf_input1, CL_FALSE, sizeof(int) * size, &input1[0], NULL, wb1); icl_write_buffer(buf_input2, CL_FALSE, sizeof(int) * size, &input2[0], NULL, wb2); size_t szLocalWorkSize = 256; float multiplier = size/(float)szLocalWorkSize; if(multiplier > (int)multiplier) multiplier += 1; size_t szGlobalWorkSize = (int)multiplier * szLocalWorkSize; icl_event* rk = icl_create_event(); icl_event* wb_all = icl_create_event_list(2, wb1, wb2); icl_run_kernel(kernel, 1, &szGlobalWorkSize, &szLocalWorkSize, wb_all, rk, 4, (size_t)0, (void *)buf_input1, (size_t)0, (void *)buf_input2, (size_t)0, (void *)buf_output, sizeof(cl_int), (void *)&size); icl_read_buffer(buf_output, CL_TRUE, sizeof(int) * size, &output[0], rk, rb); printf("Time wb1 %f\n", icl_profile_event(wb1, ICL_STARTED, ICL_FINISHED, ICL_SEC)); printf("Time wb2 %f\n", icl_profile_event(wb2, ICL_STARTED, ICL_FINISHED, ICL_SEC)); printf("Time rk %f\n", icl_profile_event(rk, ICL_STARTED, ICL_FINISHED, 
ICL_SEC)); printf("Time rb %f\n", icl_profile_event(rb, ICL_STARTED, ICL_FINISHED, ICL_SEC)); icl_release_events(5, wb1, wb2, wb_all, rk, rb); icl_release_buffers(3, buf_input1, buf_input2, buf_output); icl_release_kernel(kernel); } #ifndef INSIEME icl_restart_timer(time1); #endif icl_release_devices(); #ifndef INSIEME printf("TIME for releasing the devices: %f\n", icl_stop_timer(time1)); icl_release_timer(time1); #endif // CHECK for output printf("======================\n= Vector Mul Done\n"); unsigned int check = 1; for(unsigned int i = 0; i < size; ++i) { if(output[i] != i*size) { check = 0; printf("= fail at %d, expected %d / actual %d", i, i*3/2, output[i]); break; } } printf("= result check: %s\n======================\n", check ? "OK" : "FAIL"); free(input1); free(input2); free(output); }