/**
 * Runs the "updateTree" kernel (kernel/updateTree.cl) once per tree level,
 * starting at level treeHeight and walking down to level 0.
 *
 * @param nodelist    device buffer passed as kernel argument 0
 * @param particlesD  device buffer passed as kernel argument 2
 * @param treeD       device buffer passed as kernel argument 1
 * @param nParticles  number of particles; the node count is 2 * nParticles - 1
 * @param treeHeight  highest level index; levels treeHeight..0 are processed
 * @param dev         device the kernel is compiled for and executed on
 *
 * The kernel (and, when timing is enabled, the timer) is created on every
 * call, so both are released again before returning.
 */
void updateTree(icl_buffer *nodelist, icl_buffer *particlesD, icl_buffer *treeD, UINT nParticles, UINT treeHeight, icl_device* dev) {
    // compile OpenCL kernels
    icl_kernel* update = icl_create_kernel(dev, "kernel/updateTree.cl", "updateTree", KERNEL_BUILD_MACRO, ICL_SOURCE);

    UINT nNodes = nParticles * 2 - 1;

#if timing == 1
    icl_timer* timer_update = icl_init_timer(ICL_MILLI);
#endif

    // one work-item per node, rounded up to a multiple of the work-group size
    size_t localSizeB = 256;
    size_t globalSizeB = ((nNodes + 255) / 256) * 256;

#if timing == 1
    icl_start_timer(timer_update);
#endif

    // signed loop counter: the loop has to terminate after level 0
    for(int l = (int)treeHeight; l >= 0; --l) {
        icl_run_kernel(update, 1, &globalSizeB, &localSizeB, NULL, NULL, 5,
            (size_t)0, (void *)nodelist,
            (size_t)0, (void *)treeD,
            (size_t)0, (void *)particlesD,
            sizeof(int), &l,
            sizeof(UINT), &nNodes);
    }

    // wait for all levels before the kernel object is released (same order as drift())
    icl_finish(dev);

#if timing == 1
    icl_stop_timer(timer_update);
    printf("tree update %f\n\n", timer_update->current_time);
    icl_release_timer(timer_update);
#endif

    icl_release_kernel(update);
}
/**
 * Uploads vecs, y and RF to the file-level device buffers, runs
 * irr_fieldbu_kernel with one work-item per field point (rounded up to a
 * multiple of LOCAL_GROUP_SIZE) and reads RF back with a blocking read.
 *
 * @param nr_lines          number of cl_double4 entries uploaded from vecs
 * @param field_nr_points   number of cl_double4 entries in RF (uploaded and read back)
 * @param linelength        passed to the kernel; also scales the y upload size
 * @param vecs              host input, nr_lines elements
 * @param y                 host input
 * @param RF                host field, field_nr_points elements, overwritten with the result
 * @param resolutionfactor  passed unchanged to the kernel
 */
void ocl_make_fieldbu_irregular(int nr_lines, int field_nr_points, int linelength, const cl_double4* vecs, const double* y, cl_double4* RF, double resolutionfactor) {
    icl_write_buffer(vecs_buf, CL_FALSE, sizeof(cl_double4) * nr_lines, &vecs[0], NULL, NULL);
    // NOTE(review): precedence makes this (8 * linelength * nr_lines) bytes plus ONE byte,
    // not one extra element - confirm against the allocation of y_buf before changing it.
    icl_write_buffer(y_buf, CL_FALSE, sizeof(cl_double) * linelength*nr_lines+1, &y[0], NULL, NULL);
    icl_write_buffer(RF_buf, CL_FALSE, sizeof(cl_double4) * field_nr_points, &RF[0], NULL, NULL);

    size_t size = field_nr_points;
    size_t szLocalWorkSize = LOCAL_GROUP_SIZE;
    // Integer round-up to the next multiple of the work-group size. The former
    // float-based computation loses precision above 2^24 work-items and could
    // yield a global size smaller than the number of field points.
    size_t szGlobalWorkSize = ((size + szLocalWorkSize - 1) / szLocalWorkSize) * szLocalWorkSize;

    // size_t must be printed with %zu; %d is undefined behaviour where size_t is wider than int
    printf("\n\nocl_make_field_irregular with blocking and unrolling (%d), global size %zu, local size %zu\n", LOOP_UNROLL, szGlobalWorkSize, szLocalWorkSize);

    icl_run_kernel(irr_fieldbu_kernel, 1, &szGlobalWorkSize, &szLocalWorkSize, NULL, NULL, 6,
        sizeof(cl_int), (void *)&nr_lines,
        sizeof(cl_int), (void *)&linelength,
        (size_t)0, (void *)vecs_buf,
        (size_t)0, (void *)y_buf,
        (size_t)0, (void *)RF_buf,
        sizeof(cl_double), (void *)&resolutionfactor
    );

    icl_read_buffer(RF_buf, CL_TRUE, sizeof(cl_double4) * field_nr_points, &RF[0], NULL, NULL);
    icl_finish(device);
}
/* unrool of all loops */ void ocl_make_field3(int nr_lines, int fielddim_x, int fielddim_y, int fielddim_z, int linelength, const cl_double4* vecs, const double* y, double* RF, double resolutionfactor) { icl_write_buffer(vecs_buf, CL_FALSE, sizeof(cl_double4) * nr_lines, &vecs[0], NULL, NULL); icl_write_buffer(y_buf, CL_FALSE, sizeof(cl_double) * linelength*nr_lines+1, &y[0], NULL, NULL); size_t szLocalWorkSize = LOCAL_GROUP_SIZE; size_t size = nr_lines * fielddim_x * fielddim_y * fielddim_z; float multiplier = size/(float)szLocalWorkSize; if(multiplier > (int)multiplier) multiplier += 1; size_t szGlobalWorkSize = (int)multiplier * szLocalWorkSize; printf("\n\nocl_make_field3, global size (%d), local size (%d)\n", szGlobalWorkSize, szLocalWorkSize); icl_run_kernel(reg_field3_kernel, 1, &szGlobalWorkSize, &szLocalWorkSize, NULL, NULL, 9, sizeof(cl_int), (void *)&nr_lines, sizeof(cl_int), (void *)&fielddim_x, sizeof(cl_int), (void *)&fielddim_y, sizeof(cl_int), (void *)&fielddim_z, sizeof(cl_int), (void *)&linelength, (size_t)0, (void *)vecs_buf, (size_t)0, (void *)y_buf, (size_t)0, (void *)RF_buf, sizeof(cl_double), (void *)&resolutionfactor ); icl_read_buffer(RF_buf, CL_TRUE, sizeof(cl_double) * fielddim_x * fielddim_y * fielddim_z, &RF[0], NULL, NULL); icl_finish(device); }
void ocl_make_field1bu(int nr_lines, int fielddim_x, int fielddim_y, int fielddim_z, int linelength, const cl_double4* vecs, const double* y, double* RF, double resolutionfactor) { icl_write_buffer(vecs_buf, CL_FALSE, sizeof(cl_double4) * nr_lines, &vecs[0], NULL, NULL); icl_write_buffer(y_buf, CL_FALSE, sizeof(cl_double) * linelength*nr_lines+1, &y[0], NULL, NULL); size_t szGlobalWorkSize[1] = { fielddim_x*fielddim_y*fielddim_z }; size_t szLocalWorkSize[1] = { LOCAL_GROUP_SIZE }; printf("\n\nocl_make_field1 with blocking and unrolling factor %d, global size (%d), local size (%d)\n", LOOP_UNROLL, szGlobalWorkSize[0], szLocalWorkSize[0]); icl_run_kernel(reg_field1bu_kernel, 1, szGlobalWorkSize, szLocalWorkSize, NULL, NULL, 9, sizeof(cl_int), (void *)&nr_lines, sizeof(cl_int), (void *)&fielddim_x, sizeof(cl_int), (void *)&fielddim_y, sizeof(cl_int), (void *)&fielddim_z, sizeof(cl_int), (void *)&linelength, (size_t)0, (void *)vecs_buf, (size_t)0, (void *)y_buf, (size_t)0, (void *)RF_buf, sizeof(cl_double), (void *)&resolutionfactor ); icl_read_buffer(RF_buf, CL_TRUE, sizeof(cl_double) * fielddim_x * fielddim_y * fielddim_z, &RF[0], NULL, NULL); icl_finish(device); }
/**
 * Launches the "drift_particles" kernel from kernel/timestep.cl over the
 * particle buffer, waits for it to finish and releases the kernel again.
 *
 * @param dt          value handed to the kernel as its third argument
 * @param particles   device buffer handed to the kernel as its first argument
 * @param nParticles  particle count (second kernel argument); also determines
 *                    the number of work-items
 * @param dev         device the kernel is compiled for and executed on
 */
void drift(const FLOAT dt, icl_buffer* particles, const UINT nParticles, icl_device* dev) {
    // work-items: nParticles rounded up to a whole number of 256-wide groups
    const UINT groupWidth = 256;
    const UINT groupCount = (nParticles + (groupWidth - 1)) / groupWidth;
    size_t itemsPerGroup = groupWidth;
    size_t itemsTotal = groupCount * groupWidth;

    icl_kernel* driftKernel = icl_create_kernel(dev, "kernel/timestep.cl", "drift_particles", "-Iinclude", ICL_SOURCE);

    icl_run_kernel(driftKernel, 1, &itemsTotal, &itemsPerGroup, NULL, NULL, 3,
        (size_t)0, (void *)particles,
        sizeof(UINT), &nParticles,
        sizeof(FLOAT), &dt);

    // the kernel must have completed before its object is destroyed
    icl_finish(dev);
    icl_release_kernel(driftKernel);
}
void ocl_make_field_outcore(int nr_lines, int fielddim_x, int fielddim_y, int fielddim_z, int linelength, const cl_double4* vecs, const double* y, double* RF, double resolutionfactor) { size_t xy = fielddim_x*fielddim_y; size_t z = fielddim_z; size_t overall_size = xy * z; size_t chunk_size = OUTOFCORE_SIZE; icl_write_buffer(vecs_buf, CL_FALSE, sizeof(cl_double4) * nr_lines, &vecs[0], NULL, NULL); icl_write_buffer(y_buf, CL_FALSE, sizeof(cl_double) * linelength*nr_lines+1, &y[0], NULL, NULL); printf("ocl_make_field out-of-core with overall size of %d, chunked by %d\n", overall_size, chunk_size); // for each chunk, we send and receive back a new part of the field for(size_t offset = 0, bid = 0; offset < overall_size; offset += chunk_size, bid++) { printf("block %d offset %d,", bid, offset); size_t current_chunk_size = MIN(overall_size-offset, chunk_size); size_t szGlobalWorkSize[1] = { current_chunk_size }; size_t szLocalWorkSize[1] = { LOCAL_GROUP_SIZE }; unsigned long long in_offset = offset; // printf("\nocl_make_field out-of-core, global size (%d), local size (%d)\n", szGlobalWorkSize[0], szLocalWorkSize[0]); icl_run_kernel(reg_field_outofcore_kernel, 1, szGlobalWorkSize, szLocalWorkSize, NULL, NULL, 10, sizeof(cl_int), (void*) &nr_lines, sizeof(cl_int), (void*) &fielddim_x, sizeof(cl_int), (void*) &fielddim_y, sizeof(cl_int), (void*) &fielddim_z, sizeof(cl_int), (void*) &linelength, (size_t)0, (void*) vecs_buf, (size_t)0, (void*) y_buf, (size_t)0, (void*) RF_buf, sizeof(cl_double), (void*) &resolutionfactor, sizeof(cl_ulong), (void*) &in_offset ); icl_read_buffer(&RF_buf[0], CL_FALSE, sizeof(cl_double) * current_chunk_size, &RF[offset], NULL, NULL); } icl_finish(device); printf("\n"); }
UINT buildTree(icl_buffer *nodelist, icl_buffer *particlesD, icl_buffer *treeD, UINT nParticles, icl_device* dev) { UINT level = 1; UINT nNodes = nParticles * 2 - 1; icl_timer* timer = icl_init_timer(ICL_MILLI); // void icl_start_timer(icl_timer* timer); double time = 0; // overapproximate size of temporal lists /* struct Node** activelist = (struct Node**)malloc(nParticles * sizeof(struct Node*)); UINT activeN = 0; struct Node** smalllist = (struct Node**)malloc(nParticles * sizeof(struct Node*)); UINT smallN = 0; struct Node** nextlist = (struct Node**)malloc(nParticles * sizeof(struct Node*)); UINT nextN = 0;*/ icl_buffer* activelist = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(NodeId) * nParticles); icl_buffer* smalllist = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(NodeId) * nParticles); icl_buffer* nextlist = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(NodeId) * nParticles); icl_buffer* sizes = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(UINT) * 5); // holds the current size of each of 3 buffers: // 0 activelist // 1 nodelist // 2 smalllist // 3 old max level // 4 new max level UINT maxNchunks = ((nParticles / fmin(T, chunk_size)) * 2) -1; // assert(maxNchunks <= 256 && "adapt implementation"); // TODO allow more than 256 chunks per node icl_buffer* chunks = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(struct Chunk) * maxNchunks); icl_buffer* bboxes = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(struct BBox) * maxNchunks); size_t localSize = 1; size_t globalSize = 1; /* struct Particle* particles = (struct Particle*)malloc(3000 * sizeof(struct Particle)); icl_read_buffer(particlesD, CL_TRUE, sizeof(struct Particle) * 3000, &particles[0], NULL, NULL); printf("%f %f %f\n", particles[0].pos.x, particles[0].pos.y, particles[0].pos.z); */ // compile OpenCL kernels icl_kernel* init = icl_create_kernel(dev, "kernel/init.cl", "init", KERNEL_BUILD_MACRO, ICL_SOURCE); icl_kernel* resetChunks = icl_create_kernel(dev, "kernel/init.cl", 
"memset_chunks", KERNEL_BUILD_MACRO, ICL_SOURCE); icl_kernel* gp2c = icl_create_kernel(dev, "kernel/groupToChunks.cl", "groupParticlesIntoChunks", KERNEL_BUILD_MACRO, ICL_SOURCE); icl_kernel* cBBox = icl_create_kernel(dev, "kernel/chunkedBBox.cl", "chunkedBBox", KERNEL_BUILD_MACRO, ICL_SOURCE); icl_kernel* bBox = icl_create_kernel(dev, "kernel/bBox.cl", "bBox", KERNEL_BUILD_MACRO, ICL_SOURCE); icl_kernel* sln = icl_create_kernel(dev, "kernel/splitLargeNodes.cl", "splitLargeNodes", KERNEL_BUILD_MACRO, ICL_SOURCE); icl_kernel* sortP = icl_create_kernel(dev, "kernel/sortParticlesToChilds.cl", "sortParticlesToChilds", KERNEL_BUILD_MACRO, ICL_SOURCE); icl_kernel* snf = icl_create_kernel(dev, "kernel/smallNodeFiltering.cl", "smallNodeFiltering", KERNEL_BUILD_MACRO, ICL_SOURCE); icl_kernel* pnl = icl_create_kernel(dev, "kernel/packNextlist.cl", "packNextlist", KERNEL_BUILD_MACRO, ICL_SOURCE); ////////////////////////////////////////////////////////////////////////// icl_kernel* preScan = icl_create_kernel(dev, "kernel/sortP_prescan.cl", "sortP_prescan_chunked", KERNEL_BUILD_MACRO, ICL_SOURCE); icl_kernel* postScan = icl_create_kernel(dev, "kernel/sortP_postscan.cl", "sortP_postscan_chunked", KERNEL_BUILD_MACRO, ICL_SOURCE); segmented_scan_init(nParticles, dev, KERNEL_BUILD_MACRO, ICL_SOURCE); icl_kernel* memset_int_s = icl_create_kernel(dev, "kernel/init.cl", "memset_int_s", KERNEL_BUILD_MACRO, ICL_SOURCE); // approach with segmented scan icl_buffer *scan_data = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(int) * nParticles); icl_buffer *scan_flag = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(int) * nParticles); icl_buffer* buffered_particles = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(struct Particle) * nParticles); #if timing == 1 icl_timer* timer_gp2c = icl_init_timer(ICL_MILLI); icl_timer* timer_cBBox = icl_init_timer(ICL_MILLI); icl_timer* timer_bBox = icl_init_timer(ICL_MILLI); icl_timer* timer_sln = icl_init_timer(ICL_MILLI); icl_timer* timer_sortP 
= icl_init_timer(ICL_MILLI); icl_timer* timer_snf = icl_init_timer(ICL_MILLI); icl_timer* timer_pnl = icl_init_timer(ICL_MILLI); icl_timer* timer_ran = icl_init_timer(ICL_MILLI); icl_timer* timer_prescan = icl_init_timer(ICL_MILLI); icl_timer* timer_scan = icl_init_timer(ICL_MILLI); icl_timer* timer_postscan = icl_init_timer(ICL_MILLI); #endif //add root node to the activelist and initialize size lists icl_run_kernel(init, 1, &globalSize, &localSize, NULL, NULL, 3, (size_t)0, (void *)activelist, (size_t)0, (void *)sizes, (size_t)0, (void *)particlesD); UINT activeN = 1; icl_finish(dev); // smallest power of 2 bigger or equal to maxxNchnunks UINT pow2maxNchunks = pow2roundup(maxNchunks); // processLargeNode while(activeN != 0) { icl_start_timer(timer); // group triangles into chunks size_t localSize1 = min(pow2maxNchunks, 256); #if timing == 1 icl_start_timer(timer_gp2c); #endif size_t globalSize1 = ((maxNchunks + localSize1 -1) / localSize1) * localSize1; // reset chunks icl_run_kernel(resetChunks, 1, &globalSize1, &localSize1, NULL, NULL, 2, (size_t)0, (void *)chunks, sizeof(UINT), &maxNchunks); globalSize1 = localSize1 * activeN; // split every node in chunk of chunk_size icl_run_kernel(gp2c, 1, &globalSize1, &localSize1, NULL, NULL, 4, (size_t)0, (void *)nodelist, (size_t)0, (void *)activelist, (size_t)0, (void *)chunks, sizeof(UINT), &activeN); #if timing == 1 icl_finish(dev); icl_stop_timer(timer_gp2c); #endif // compute per chunk bounding box size_t localSize2 = chunk_size; size_t globalSize2 = maxNchunks * chunk_size; #if timing == 1 icl_start_timer(timer_cBBox); #endif icl_run_kernel(cBBox, 1, &globalSize2, &localSize2, NULL, NULL, 5, (size_t)0, (void *)nodelist, (size_t)0, (void *)activelist, (size_t)0, (void *)particlesD, (size_t)0, (void *)chunks, (size_t)0, (void *)bboxes); #if timing == 1 icl_finish(dev); icl_stop_timer(timer_cBBox); #endif // compute per node bounding box size_t localSize3 = min(pow2maxNchunks, 256); size_t globalSize3 = localSize3 * 
activeN; #if timing == 1 icl_start_timer(timer_bBox); #endif icl_run_kernel(bBox, 1, &globalSize3, &localSize3, NULL, NULL, 4, (size_t)0, (void *)nodelist, (size_t)0, (void *)activelist, (size_t)0, (void *)bboxes, sizeof(UINT), &activeN); #if timing == 1 icl_finish(dev); icl_stop_timer(timer_bBox); #endif // split large nodes size_t localSize4 = 256; size_t globalSize4 = ((activeN + 255) / 256) * 256; #if timing == 1 icl_start_timer(timer_sln); #endif icl_run_kernel(sln, 1, &globalSize4, &localSize4, NULL, NULL, 5, (size_t)0, (void *)nodelist, (size_t)0, (void *)activelist, (size_t)0, (void *)nextlist, (size_t)0, (void *)sizes, sizeof(UINT), &activeN); #if timing == 1 icl_finish(dev); icl_stop_timer(timer_sln); #endif /////////////////////////////////////////////////////////////////////////////// // XXx replaced with segmented scan //globalSize = (activeN+1) * 256; #if timing == 1 icl_start_timer(timer_sortP); #endif #if DEVICE == ICL_CPU // sort particles to child nodes size_t localSize5 = 256; size_t globalSize5 = ((activeN + 255) / 256) * 256; icl_run_kernel(sortP, 1, &globalSize5, &localSize5, NULL, NULL, 5, (size_t)0, (void *)nodelist, (size_t)0, (void *)particlesD, (size_t)0, (void *)activelist, (size_t)0, (void *)nextlist, sizeof(UINT), &activeN ); #else // init scan_flag to 1 cl_int initFlag = 1; size_t np = (size_t)((nParticles + localSize4 -1 ) / localSize4) * localSize4; icl_run_kernel(memset_int_s, 1, &np, &localSize4, NULL, NULL, 3, (size_t)0, (void *)scan_flag, sizeof(cl_int), &initFlag, sizeof(UINT), &nParticles ); #if timing == 1 icl_start_timer(timer_prescan); #endif // pre-scan fills data0 and data1 with 1 and 0 whenever value < pivot localSize = chunk_size; // globalSize = activeN * 256; globalSize = maxNchunks * chunk_size; icl_run_kernel(preScan, 1, &globalSize, &localSize, NULL, NULL, 7, (size_t)0, (void *)nodelist, (size_t)0, (void *)chunks, (size_t)0, (void *)particlesD, (size_t)0, (void *)activelist, sizeof(UINT), &activeN, (size_t)0, (void 
*)scan_data, (size_t)0, (void *)scan_flag ); #if timing == 1 icl_finish(dev); icl_stop_timer(timer_prescan); #endif #if timing == 1 icl_start_timer(timer_scan); #endif // scan for scan(scan_data, scan_flag, nParticles); #if timing == 1 icl_finish(dev); icl_stop_timer(timer_scan); #endif // copy partially sorted data to the final icl_copy_buffer(particlesD, buffered_particles, sizeof(struct Particle) * nParticles, NULL, NULL); // swap(particlesD, buffered_particles); #if timing == 1 icl_start_timer(timer_postscan); #endif localSize = chunk_size; globalSize = maxNchunks * chunk_size; icl_run_kernel(postScan, 1, &globalSize, &localSize, NULL, NULL, 7, (size_t)0, (void *)nodelist, (size_t)0, (void *)chunks, (size_t)0, (void *)particlesD, (size_t)0, (void *)activelist, sizeof(UINT), &activeN, (size_t)0, (void *)buffered_particles, (size_t)0, (void *)scan_data ); #if timing == 1 icl_finish(dev); icl_stop_timer(timer_postscan); #endif icl_finish(dev); #endif /////////////////////////////////////////////////////////////////////////////// #if timing == 1 icl_finish(dev); icl_stop_timer(timer_sortP); #endif // small node filtering size_t localSize6 = 256; size_t globalSize6 = ((activeN*2 + 255) / 256) * 256; #if timing == 1 icl_start_timer(timer_snf); #endif icl_run_kernel(snf, 1, &globalSize6, &localSize6, NULL, NULL, 4, (size_t)0, (void *)nodelist, (size_t)0, (void *)nextlist, (size_t)0, (void *)smalllist, (size_t)0, (void *)sizes //, sizeof(UINT), &nParticles ); #if timing == 1 icl_finish(dev); icl_stop_timer(timer_snf); #endif // packing of nextlist size_t localSize7 = 1; size_t globalSize7 = 1; #if timing == 1 icl_start_timer(timer_pnl); #endif icl_run_kernel(pnl, 1, &globalSize7, &localSize7, NULL, NULL, 2, (size_t)0, (void *)nextlist, (size_t)0, (void *)sizes ); #if timing == 1 icl_finish(dev); icl_stop_timer(timer_pnl); #endif // swap nextlist and activelist swap(&nextlist, &activelist); #if timing == 1 icl_start_timer(timer_ran); #endif // read size of next 
activelist set in kernel icl_read_buffer(sizes, CL_TRUE, sizeof(UINT), &activeN, NULL, NULL); icl_finish(dev); #if timing == 1 icl_stop_timer(timer_ran); #endif ++level; //printf("%d: ActiveN %d\n", level, activeN); time = icl_stop_timer(timer); } icl_release_kernel(init); icl_release_kernel(gp2c); icl_release_kernel(cBBox); icl_release_kernel(bBox); icl_release_kernel(sln); icl_release_kernel(sortP); icl_release_kernel(snf); icl_release_kernel(pnl); ////////////////////////////////////////////////////////////////////////// icl_release_kernel(preScan); icl_release_kernel(postScan); segmented_scan_release(); icl_release_buffers(3, scan_data, scan_flag, buffered_particles); #if timing == 1 printf("gp2c %f\ncBBox %f\nbBox %f\nsln %f\nsortP %f\nsnf %f\npnl %f\nran %f\n\n", timer_gp2c->current_time, timer_cBBox->current_time, timer_bBox->current_time, timer_sln->current_time, timer_sortP->current_time, timer_snf->current_time, timer_pnl->current_time, timer_ran->current_time); icl_release_timer(timer_gp2c); icl_release_timer(timer_cBBox); icl_release_timer(timer_bBox); icl_release_timer(timer_sln); icl_release_timer(timer_sortP); icl_release_timer(timer_snf); icl_release_timer(timer_pnl); printf("prescan %f\nscan %f\npostscan %f\n\n", timer_prescan->current_time, timer_scan->current_time, timer_postscan->current_time); icl_release_timer(timer_prescan); icl_release_timer(timer_scan); icl_release_timer(timer_postscan); #endif /* icl_read_buffer(nodelist, CL_TRUE, sizeof(struct Node) * 6000, tree->nodelist, NULL, NULL); printf("node: %d, left %d, right %d", tree->nodelist[0].particlesHigh - tree->nodelist[0].particlesLow, tree->nodelist[1].particlesHigh - tree->nodelist[1].particlesLow, tree->nodelist[2].particlesHigh - tree->nodelist[2].particlesLow); printBox(tree->nodelist[49].bounding_box); printBox(tree->nodelist[53].bounding_box); printBox(tree->nodelist[54].bounding_box); for(int i = 0; i < 6000; ++i) if(tree->nodelist[i].bounding_box.box[0].x != 0.0) 
printBox(tree->nodelist[i].bounding_box); */ //small nodes stage // preprocessSmallNodes(smalllist); icl_release_buffers(3, activelist, chunks, bboxes); icl_kernel* sasl = icl_create_kernel(dev, "kernel/swapActiveAndSmalllist.cl", "swapActiveAndSmalllist", KERNEL_BUILD_MACRO, ICL_SOURCE); icl_kernel* ssn = icl_create_kernel(dev, "kernel/splitSmallNodes.cl", "splitSmallNodes", KERNEL_BUILD_MACRO, ICL_SOURCE); #if timing == 1 icl_timer* timer_ssn = icl_init_timer(ICL_MILLI); icl_timer* timer_sasl = icl_init_timer(ICL_MILLI); icl_timer* timer_rsn = icl_init_timer(ICL_MILLI); #endif size_t localSize8 = 1; size_t globalSize8 = 1; UINT setMaxLevel = 0; icl_run_kernel(sasl, 1, &globalSize8, &localSize8, NULL, NULL, 2, (size_t)0, (void *)sizes, sizeof(UINT), &level); // get number of small nodes icl_read_buffer(sizes, CL_TRUE, sizeof(UINT), &activeN, NULL, NULL); while(activeN != 0) { icl_start_timer(timer); // compute SVH and determine the split plane size_t localSize9 = 256; size_t globalSize9 = ((activeN + 255) / 256) * 256; #if timing == 1 icl_start_timer(timer_ssn); #endif icl_run_kernel(ssn, 1, &globalSize9, &localSize9, NULL, NULL, 5, (size_t)0, (void *)nodelist, (size_t)0, (void *)smalllist, (size_t)0, (void *)nextlist, (size_t)0, (void *)particlesD, (size_t)0, (void *)sizes); #if timing == 1 icl_finish(dev); icl_stop_timer(timer_ssn); #endif size_t localSizeA = 1; size_t globalSizeA = 1; #if timing == 1 icl_start_timer(timer_sasl); #endif icl_run_kernel(sasl, 1, &globalSizeA, &localSizeA, NULL, NULL, 2, (size_t)0, (void *)sizes, sizeof(UINT), &setMaxLevel); #if timing == 1 icl_finish(dev); icl_stop_timer(timer_sasl); #endif swap(&nextlist, &smalllist); // read size of next activelist set in kernel #if timing == 1 icl_start_timer(timer_rsn); #endif icl_read_buffer(sizes, CL_TRUE, sizeof(UINT), &activeN, NULL, NULL); //printf("small size %d\n", activeN); icl_finish(dev); #if timing == 1 icl_stop_timer(timer_rsn); #endif time = icl_stop_timer(timer); } 
icl_release_buffer(smalllist); icl_release_buffer(nextlist); icl_release_kernel(sasl); icl_release_kernel(ssn); #if timing == 1 printf("ssn %f\nsasl %f\nrsn %f\n\n", timer_ssn->current_time, timer_sasl->current_time, timer_rsn->current_time); icl_release_timer(timer_ssn); icl_release_timer(timer_sasl); icl_release_timer(timer_rsn); #endif UINT s[5]; icl_read_buffer(sizes, CL_TRUE, sizeof(UINT) * 5, &s, NULL, NULL); icl_release_buffer(sizes); icl_kernel* upPass = icl_create_kernel(dev, "kernel/upPass.cl", "upPass", KERNEL_BUILD_MACRO, ICL_SOURCE); icl_kernel* downPass = icl_create_kernel(dev, "kernel/kdDownPass.cl", "downPass", KERNEL_BUILD_MACRO, ICL_SOURCE); #if timing == 1 icl_timer* timer_upPass = icl_init_timer(ICL_MILLI); icl_timer* timer_downPass = icl_init_timer(ICL_MILLI); icl_timer* timer_rt = icl_init_timer(ICL_MILLI); #endif UINT treeHeight = s[4]; printf("Tree height: %d\n", treeHeight); size_t localSizeB = 256; size_t globalSizeB = ((nNodes + 255) / 256) * 256; icl_start_timer(timer); #if timing == 1 icl_start_timer(timer_upPass); #endif for(int l = (int)treeHeight; l >= 0; --l) { icl_run_kernel(upPass, 1, &globalSizeB, &localSizeB, NULL, NULL, 4, (size_t)0, (void *)nodelist, (size_t)0, (void *)particlesD, sizeof(int), &l, sizeof(UINT), &nNodes); } #if timing == 1 icl_finish(dev); icl_stop_timer(timer_upPass); icl_start_timer(timer_downPass); #endif for(UINT l = 0; l <= treeHeight; ++l) { icl_run_kernel(downPass, 1, &globalSizeB, &localSizeB, NULL, NULL, 4, (size_t)0, (void *)nodelist, (size_t)0, (void *)treeD, sizeof(UINT), &l, sizeof(UINT), &nNodes); } icl_finish(dev); #if timing == 1 icl_stop_timer(timer_downPass); #endif time = icl_stop_timer(timer); icl_release_kernel(upPass); icl_release_kernel(downPass); #if timing == 1 icl_start_timer(timer_rt); #endif #if timing == 1 icl_finish(dev); icl_stop_timer(timer_rt); printf("upPass %f\ndownPass %f\nread Tree %f\n\n", timer_upPass->current_time, timer_downPass->current_time, timer_rt->current_time); 
icl_release_timer(timer_upPass); icl_release_timer(timer_downPass); icl_release_timer(timer_rt); #endif // struct Node* kdTree = (struct Node*)malloc(sizeof(struct Node) * nNodes); // icl_read_buffer(treeD, CL_TRUE, sizeof(struct Node) * nNodes, kdTree, NULL, NULL); // printf("%d", tree->nodelist[0].left_child); printf("\nTime: %f\n", time); icl_release_timer(timer); return treeHeight; }