// backproject from 2D to 3D for a single image int bckpj3(Vec3i volsize, int nrays, int nnz, float *dm, Vec3i origin, int ri, int *ptrs, int *cord, float *x, float *y) { int i, j, iqx,iqy, xc, yc, zc; float xb, yb, dx, dy, dx1m, dy1m, dxdy; int status = 0; int xcent = origin[0]; int ycent = origin[1]; int zcent = origin[2]; int nx = volsize[0]; int ny = volsize[1]; // Phi: adding the shift parameters that get passed in as the last two entries of dm float sx, sy; sx = dm(7); sy = dm(8); if ( nx > 2*ri) { for (i = 1; i <= nrays; i++) { zc = cord(1,i) - zcent; yc = cord(2,i) - ycent; xc = cord(3,i) - xcent; xb = zc*dm(1)+yc*dm(2)+xc*dm(3) + xcent + sx; yb = zc*dm(4)+yc*dm(5)+xc*dm(6) + ycent + sy; for (j = ptrs(i); j <ptrs(i+1); j++) { iqx = ifix(xb); iqy = ifix(yb); dx = xb - iqx; dy = yb - iqy; dx1m = 1.0 - dx; dy1m = 1.0 - dy; dxdy = dx*dy; /* c y(j) = y(j) + dx1m*dy1m*x(iqx , iqy) c & + dx1m*dy *x(iqx , iqy+1) c & + dx *dy1m*x(iqx+1, iqy) c & + dx *dy *x(iqx+1, iqy+1) c c --- faster version of the above commented out c code (derived by summing the following table c of coefficients along the colunms) --- c c 1 dx dy dxdy c ------ -------- -------- ------- c x(i,j) -x(i,j) -x(i,j) x(i,j) c x(i,j+1) -x(i,j+1) c x(i+1,j) -x(i+1,j) c x(i+1,j+1) c */ // Phi: add index checking, now that shifts are being used if ( iqx <= nx && iqy <= ny && iqx >= 1 && iqy >= 1 ) { y(j) += x(iqx,iqy); if ( iqx + 1 <= nx && iqx + 1 >= 1 ) { y(j) += dx*(-x(iqx,iqy)+x(iqx+1,iqy)); } if ( iqy + 1 <= ny && iqy + 1 >= 1 ) { y(j) += dy*(-x(iqx,iqy)+x(iqx,iqy+1)); } if ( iqx + 1 <= nx && iqy + 1 <= ny && iqx + 1 >= 1 && iqy + 1 >= 1 ) { y(j) += dxdy*( x(iqx,iqy) - x(iqx,iqy+1) -x(iqx+1,iqy) + x(iqx+1,iqy+1) ); } } // y(j) += x(iqx,iqy) // + dx*(-x(iqx,iqy)+x(iqx+1,iqy)) // + dy*(-x(iqx,iqy)+x(iqx,iqy+1)) // + dxdy*( x(iqx,iqy) - x(iqx,iqy+1) // -x(iqx+1,iqy) + x(iqx+1,iqy+1) ); xb += dm(1); yb += dm(4); } // end for j } // end for i } else { fprintf(stderr, "bckpj3: nx must be greater than 
2*ri\n"); } return status; }
// Returns the position n elements past ptrs(); no range check is performed here.
pointer at(std::size_t n)const
{
  return ptrs()+n;
}
// project from 3D to 2D (single image) int fwdpj3(Vec3i volsize, int nrays, int nnz, float *dm, Vec3i origin, int ri, int *ptrs, int *cord, float *x, float *y) { /* purpose: y <--- proj(x) input : volsize the size (nx,ny,nz) of the volume nrays number of rays within the compact spherical representation nnz number of voxels within the sphere dm an array of size 9 storing transformation associated with the projection direction origin coordinates of the center of the volume ri radius of the sphere ptrs the beginning address of each ray cord the coordinates of the first point in each ray x 3d input volume y 2d output image */ int iqx, iqy, i, j, xc, yc, zc; float ct, dipx, dipy, dipx1m, dipy1m, xb, yb, dm1, dm4; int status = 0; // Phi: adding the shift parameters that get passed in as the last two entries of dm float sx, sy; sx = dm(7); sy = dm(8); int xcent = origin[0]; int ycent = origin[1]; int zcent = origin[2]; int nx = volsize[0]; int ny = volsize[1]; dm1 = dm(1); dm4 = dm(4); if ( nx > 2*ri ) { for (i = 1; i <= nrays; i++) { zc = cord(1,i)-zcent; yc = cord(2,i)-ycent; xc = cord(3,i)-xcent; xb = zc* dm(1) +yc* dm(2) +xc* dm(3) + xcent + sx; yb = zc* dm(4) +yc* dm(5) +xc* dm(6) + ycent + sy; for (j = ptrs(i); j< ptrs(i+1); j++) { iqx = ifix(xb); iqy = ifix(yb); ct = x(j); // dipx = xb - (float)(iqx); // dipy = (yb - (float)(iqy)) * ct; dipx = xb - iqx; dipy = (yb - iqy) * ct; dipy1m = ct - dipy; dipx1m = 1.0 - dipx; if (iqx <= nx && iqy <= ny && iqx >= 1 && iqy >= 1) // y(iqx ,iqy) = y(iqx ,iqy) + dipx1m*dipy1m; y(iqx ,iqy) += dipx1m*dipy1m; if (iqx + 1 <= nx && iqy <= ny && iqx >= 0 && iqy >= 1) // y(iqx+1,iqy) = y(iqx+1,iqy) + dipx*dipy1m; y(iqx+1,iqy) += dipx*dipy1m; if (iqx + 1 <= nx && iqy + 1 <= ny && iqx >= 0 && iqy >= 0) // y(iqx+1,iqy+1) = y(iqx+1,iqy+1) + dipx*dipy; y(iqx+1,iqy+1) += dipx*dipy; if (iqx <= nx && iqy + 1 <= ny && iqx >= 1 && iqy >= 0) // y(iqx ,iqy+1) = y(iqx ,iqy+1) + dipx1m*dipy; y(iqx ,iqy+1) += dipx1m*dipy; xb += dm1; yb += dm4; } } } 
else { fprintf(stderr, " nx must be greater than 2*ri\n"); exit(1); } return status; }
// Start of the range: exactly what ptrs() returns.
pointer begin()const
{
  return ptrs();
}
// One-past-the-end of the range: ptrs() advanced by size_ elements.
pointer end()const
{
  return ptrs()+size_;
}
void selection_properties_t::notify_save_inline_edit(const char * value) { static_api_ptr_t<metadb_io_v2> tagger_api; if (strcmp(value, "<mixed values>")) { pfc::list_t<pfc::string8> values; const char *ptr = value, *start = ptr; while (*ptr) { start = ptr; while (*ptr != ';' && *ptr) ptr++; values.add_item(pfc::string8(start, ptr - start)); while (*ptr == ' ' || *ptr == ';') ptr++; } t_size j, value_count = values.get_count(); metadb_handle_list ptrs(m_edit_handles); pfc::list_t<file_info_impl> infos; pfc::list_t<bool> mask; pfc::list_t<const file_info *> infos_ptr; t_size i, count = ptrs.get_count(); mask.set_count(count); infos.set_count(count); //infos.set_count(count); for (i = 0; i < count; i++) { assert(ptrs[i].is_valid()); mask[i] = !ptrs[i]->get_info(infos[i]); infos_ptr.add_item(&infos[i]); if (!mask[i]) { pfc::string8 old_value; g_print_field(m_edit_field, infos[i], old_value); if (!(mask[i] = !((strcmp(old_value, value))))) { infos[i].meta_remove_field(m_edit_field); for (j = 0; j < value_count; j++) infos[i].meta_add(m_edit_field, values[j]); } } } infos_ptr.remove_mask(mask.get_ptr()); ptrs.remove_mask(mask.get_ptr()); { service_ptr_t<file_info_filter_impl> filter = new service_impl_t<file_info_filter_impl>(ptrs, infos_ptr); tagger_api->update_info_async(ptrs, filter, GetAncestor(get_wnd(), GA_ROOT), metadb_io_v2::op_flag_no_errors | metadb_io_v2::op_flag_background | metadb_io_v2::op_flag_delay_ui, NULL); } } /*if (m_edit_index < m_fields.get_count()) { (m_edit_column ? m_fields[m_edit_index].m_name : m_fields[m_edit_index].m_name_friendly) = value; pfc::list_t<t_list_view:: t_item_insert> items; items.set_count(1); items[0].m_subitems.add_item(m_fields[m_edit_index].m_name_friendly); items[0].m_subitems.add_item(m_fields[m_edit_index].m_name); replace_items(m_edit_index, items); }*/ m_edit_column = pfc_infinite; m_edit_index = pfc_infinite; m_edit_field.reset(); m_edit_handles.remove_all(); }
// Address of element n of ptrs(); no range check is performed here.
value_type* at(std::size_t n)const
{
  return &(ptrs()[n]);
}
// One-past-the-end: address of the element at index size_ of ptrs().
value_type* end()const
{
  return &(ptrs()[size_]);
}
// Start of the range: address of element 0 of ptrs().
value_type* begin()const
{
  return &(ptrs()[0]);
}
// Host-side driver for the OpenCL navigation test.
//
// NOTE(review): the body below appears to have lost its original line breaks, so
// `//` comments, preprocessor directives and statements share physical lines.
// The summary lists the calls that appear in the text, in order; whether the
// statements that follow markers such as //EDIT, //REMOVE: or //OLD:EDIT2 are
// live or commented out must be confirmed against the original layout.
//
//  1. Builds a TestCase from argc/argv. numInput is testCase.xres * testCase.yres,
//     or testCase.getSize() when PHYSICS is defined; numOutput equals numInput.
//  2. The command-line flag "-b" selects loading "prog.ptx" instead of compiling
//     "prog.cl". When compiling, the binary is written to "prog.ptx".
//  3. Creates the kernels "trace", "relocate", "test", "check" and "checkgeom",
//     then allocates the pinned buffer pairs and device buffers.
//  4. Copies testCase.geom->ptrs into the pinned `ptrs` buffer and transfers it.
//     NOTE(review): that buffer is sized with size*2*sizeof(int) while the memcpy
//     length is size*2*sizeof(GEOMTYPE) - confirm both types have the same size.
//  5. Runs the "test" kernel, prints the first 8 ints of the `ptrs` and `result`
//     host buffers, then runs the "check" kernel and prints sizeof(GEOMTYPE).
//  6. Reads a cl_mem value from the `result` host buffer (gpuhandle) and writes
//     testCase.geom->getBuffer() to gpuGeom. When CHECK is 2 or 4 it also runs
//     "checkgeom" and calls check_navigation with number_of_increments.
//  7. Copies testCase.input into gpuInput, sets the "trace" kernel arguments
//     (extra buffers when GLOBAL_MODE == 1), runs it with BlockSize/BlockSize,
//     calls check_navigation when CHECK is 1 or 4, prints 32 G4double values of
//     `result` plus the first particle's position and direction, copies
//     gpuOutput into testCase.output and calls outputData("imggpu.txt").
//
// Returns EXIT_SUCCESS. Unless NOCATCH is defined, a std::runtime_error is
// caught, its what() is printed to std::cerr and EXIT_FAILURE is returned.
int main( int argc, char *argv[] ) { #ifndef NOCATCH try { #endif // Initialize geometry and input/output buffers TestCase testCase( argc, argv ); #ifndef PHYSICS int numInput = testCase.xres * testCase.yres; #else int numInput = testCase.getSize(); #endif const int numOutput = numInput; int number_of_increments = 5; const char *BINARY_FILE_NAME = "prog.ptx"; const char *SOURCE_FILE_NAME = "prog.cl"; // Command line handling bool isBinary = false; for ( int i=1; i<argc; ++i ) { const std::string arg( argv[i] ); if ( arg == "-b" ) isBinary = true; /*else throw std::runtime_error( "Invalid option "+arg );*/ } // Load or compile program std::cerr << (isBinary ? "Loading" : "Compiling") << "..."; //EDIT: Removing all of his clock functions, can use the VS Porfiler if necessary. //const my_clock_t tc0 = my_clock(); CL::SingleFileSingleGPUSetup gpuSetup( isBinary ? BINARY_FILE_NAME : SOURCE_FILE_NAME, isBinary, COMPILER_FLAGS ); //const my_clock_t tc1 = my_clock(); std::cout<<"SETUP complete"<<std::endl; //std::cerr << "done in " << tdiff(tc0,tc1) << " seconds\n\n"; #ifdef VERBOSE std::cerr << " ---- Build log\n" << gpuSetup.getBuildLog() << "\n"; #endif // Save "binary" (PTX bytecode) for reuse if ( !isBinary ) { std::ofstream f( BINARY_FILE_NAME ); gpuSetup.writeBinary( f ); } //EDIT: Changing code to get rid of kernel getptr //1) Define new kernel relocate //2) Use function getPtrs to get the vector ptrs and use it to define size (ptrs.size()) and offset (sizeof(int)) on host. //3) Allocate some mem on GPU to store ptrs and use it in the relocate kernel. //4) Move the enqueue write here so that the WorldVolumePointer is defined. //5) As soon as the kernel is done, free the memory used for ptrs. // Import handles to OpenCL kernels (functions) CL::Kernel kernelTrace( gpuSetup, "trace" ); // Kernel trace is the main kernel which does the navigation. CL::Kernel kernelRelocate( gpuSetup, "relocate" ); // The kernel version of relocate. 
It is possible to run relocate without using the kernel straight on the host. CL::Kernel kernelTest( gpuSetup, "test"); // Kernel to check for inconsistencies. //EDIT: New kernel check CL::Kernel kernelCheck ( gpuSetup, "check"); //EDIT2 CL::Kernel kernelCheckGeometry(gpuSetup, "checkgeom"); // Kernel to confirm gemoetry relocation happened as it should. // Reserve GPU & host buffers //EDIT Getting size of ptrs int size = testCase.geom->ptrs_size(); //REMOVE: std::cout<<"Size of ptrs is = "<<size<<std::endl; int size_of_logical_checks = 1000; const int auxBufSz = sizeof(cl_mem); // Page-locked buffers for fast DMA-IO CL::PinnedBufferPair gpuInput( gpuSetup, numInput*sizeof(StubParticle), CL_MEM_READ_WRITE, CL_MAP_WRITE ); CL::PinnedBufferPair gpuOutput( gpuSetup, numOutput*sizeof(G4double), CL_MEM_WRITE_ONLY, CL_MAP_READ ); CL::PinnedBufferPair gpuAux( gpuSetup, auxBufSz, CL_MEM_READ_WRITE, CL_MAP_READ | CL_MAP_WRITE ); //EDIT CL::PinnedBufferPair ptrs( gpuSetup, size*2*sizeof(int), CL_MEM_READ_WRITE, CL_MAP_WRITE ); CL::PinnedBufferPair result( gpuSetup,size_of_logical_checks*sizeof(cl_mem), CL_MEM_WRITE_ONLY, CL_MAP_READ ); #if (GLOBAL_MODE ==1) // sizeof( size_t ) ? 
CL::Buffer Numbers_Of_Solid( gpuSetup, CL_MEM_READ_WRITE, numInput*sizeof(int)); CL::Buffer Sum_Of_Solid( gpuSetup, CL_MEM_READ_WRITE, numInput*sizeof(int)); CL::Buffer Solids( gpuSetup, CL_MEM_READ_WRITE, numInput*sizeof(SolidInfo)); CL::Buffer Result_For_Current_Solid( gpuSetup, CL_MEM_READ_WRITE, numInput*sizeof(ResultInfo)); CL::Buffer Compacter_Result( gpuSetup, CL_MEM_READ_WRITE, numInput*sizeof(FinalResult)); #endif CL::Buffer nullVNode( gpuSetup, CL_MEM_READ_WRITE, 2 *sizeof(G4SmartVoxelNode )); CL::Buffer noStepArray ( gpuSetup, CL_MEM_READ_WRITE, numInput*sizeof ( bool )); CL::Buffer LocationArray( gpuSetup, CL_MEM_READ_WRITE, numInput * sizeof( PointInformation)); std::cout<<"Pinned Buffers allocation complete"<<std::endl; // GPU only buffers //EDIT CL::Buffer gpuGeom( gpuSetup, CL_MEM_READ_WRITE, testCase.geom->size()); //EDIT2: //gpuSetup.enqueueWriteBuffer( gpuGeom, testCase.geom->getBuffer() ); std::cout<<"Device Buffers allocation complete"<<std::endl; std::memcpy( ptrs.getHostPtr(), &(testCase.geom->ptrs[0]), size*2*sizeof(GEOMTYPE) ); //EDIT 2: //check_navigation(ptrs.getHostPtr(), size); ptrs.transferToDevice(); gpuSetup.finish(); //EDIT kernelTest.setArg(0, result.getDeviceBuffer()); if( GLOBAL_MODE == 1) kernelTest.setArg(1, noStepArray); gpuSetup.enqueueKernel( kernelTest, 8, 8); gpuSetup.finish(); result.transferFromDevice(); gpuSetup.finish(); FinalResult * final; int * a = ( int *)(ptrs.getHostPtr()); std::cout<<"Printing input: \n"; for (int i=0; i<8; i++) std::cout<< a[i] << " "; //EDIT : Printing the output array std::cout<<"Printing output: \n"; for (int i=0; i<8; i++) std::cout<< (( int *)(result.getHostPtr()))[i] << " "; //EDIT: Changed kernel Test to fix Prefix Sum //std::cout<< " Values that were returned: "; //std::cout<<" For thread 1: "; //final = (FinalResult *)result.getHostPtr(); //final += sizeof( FinalResult); //std::cout<<": Min. 
Step value = "<< final->step<< " and safety returned = "<< final->safety << std::endl; // MODIFY: have to loop through this and add as a check /* for( int i =0 ; i < 4; i++) { std::cout<<" For thread "<< i; final = (FinalResult **)result.getHostPtr(); std::cout<<": Min. Step value = "<< final[i]->step<< " and safety returned = "<< final[i]->safety << std::endl; } */ /*std::cout<<"\nOriginal values: "; for( int i =0 ; i<32; i++) { std::cout << ((int *)ptrs.getHostPtr())[i]<<" "; } */ //NOTE: kernelCheck is badly named. It was originally the replacement for kernel getPtr and was used to return the geometry start location on the GPU. // IT can also be used to check the sizes on CPU and GPu are consistent kernelCheck.setArg( 0 , gpuGeom); kernelCheck.setArg( 1, result.getDeviceBuffer()); gpuSetup.enqueueKernel(kernelCheck, 1, 1); gpuSetup.finish(); //EDIT result.transferFromDevice(); gpuSetup.finish(); std::cout<< "On the CPU, size of GEOMTYPE = " << sizeof( GEOMTYPE )<<"\n"; //REMOVE: std::cout<<"Size of GEOMTYPE on GPU -> "<<*((int *)result.getHostPtr())<<std::endl; //MODIFY: // Assert that these are equal here. 
//EDIT2: //int answer = *(int*)(result.getHostPtr()); //std::cout<<"Result before is "<< answer<< std::endl; std::cout<< "About to run relocate, no problem so far\n"; //EDIT2 /* gpuSetup.enqueueWriteBuffer( gpuGeom, testCase.geom->getBuffer() ); gpuSetup.finish(); kernelRelocate.setArg( 0, ptrs.getDeviceBuffer()); kernelRelocate.setArg (1, gpuGeom); kernelRelocate.setArg(2, sizeof(int), &size); //MODIFY gpuSetup.enqueueKernel(kernelRelocate,size*2,2); gpuSetup.finish(); */ //OLD // Fetch address of gpuGeom in device memory space (kludge) //kernelGetPtr.setArg( 0, gpuGeom ); //kernelGetPtr.setArg( 1, gpuAux.getDeviceBuffer() ); //gpuSetup.enqueueTask( kernelGetPtr ); //gpuAux.transferFromDevice( CL_TRUE, 0, sizeof(cl_mem) ); //cl_mem gpuhandle = *(cl_mem*)gpuAux.getHostPtr(); //EDIT2 cl_mem gpuhandle = *(cl_mem*)result.getHostPtr(); //OLD:EDIT2 testCase.geom->relocate( gpuhandle ); //EDIT2 gpuSetup.enqueueWriteBuffer( gpuGeom, testCase.geom->getBuffer() ); //EDIT2: Kernel which returns the checks if(CHECK == 2 || CHECK == 4) { kernelCheckGeometry.setArg( 0 , gpuGeom); kernelCheckGeometry.setArg( 1 , result.getDeviceBuffer()); kernelCheckGeometry.setArg( 2, sizeof(int), &number_of_increments); gpuSetup.enqueueKernel(kernelCheckGeometry, 1, 1); gpuSetup.finish(); result.transferFromDevice(); gpuSetup.finish(); check_navigation( result.getHostPtr(), testCase.geom->VolumeStore, number_of_increments); // print for Geometry test; } // MODIFY: He also notes how this is not the optimal way of doing this. Memory can be saved here. 
std::memcpy( gpuInput.getHostPtr(), &(testCase.input[0]), gpuInput.size() ); //REMOVE std::cout<<"About to run trace\n"; // THis part was written for test purpose to see if error was caused due to shared memory // This kernel is getting to have WAY too many arguments gpuInput.transferToDevice(); gpuSetup.finish(); // Set GPU kernel arguments kernelTrace.setArg( 0, gpuInput.getDeviceBuffer() ); kernelTrace.setArg( 1, gpuOutput.getDeviceBuffer() ); kernelTrace.setArg( 2, gpuGeom ); kernelTrace.setArg( 3, sizeof(G4double), &(testCase.phys_step) ); kernelTrace.setArg( 4, sizeof(cl_int), &numInput ); #ifdef CHECK kernelTrace.setArg( 5, result.getDeviceBuffer()); #endif // Two uses -: One for debugging and one for checking. Remove at some point. if( CHECK == 1 || CHECK == 4) kernelTrace.setArg( 5, result.getDeviceBuffer()); #if( GLOBAL_MODE ==1) //kernelTrace.setArg( 6, Numbers_Of_Solid ); //kernelTrace.setArg( 7, Sum_Of_Solid ); kernelTrace.setArg( 6, Solids ); kernelTrace.setArg( 7, Result_For_Current_Solid ); kernelTrace.setArg( 8, Compacter_Result ); kernelTrace.setArg( 9, nullVNode); #endif /* NOTE: The current way of setting kernel trace's arguments is bad and is bound to cause problems in future Change the implementation to either removes one check or perhaps just replace the existing check with something more useful. 
*/ std::cout<<"Arguments set and value of Physical step sent on CPU is = "<< (testCase.phys_step)<<std::endl; // Write input to GPU memory // const my_clock_t t1 = my_clock(); //OLD /*gpuSetup.enqueueWriteBuffer( gpuGeom, testCase.geom->getBuffer() );*/ std::cout<< "Write complete, transfer done, finish\n"; // Actual execution //const my_clock_t t2 = my_clock(); //EDIT //gpuSetup.enqueueKernel( kernelTrace, numInput, blockSize ); gpuSetup.enqueueKernel( kernelTrace, BlockSize, BlockSize ); gpuSetup.finish(); std::cout<<"Kernel trace done\n"; //EDIT2: result.transferFromDevice(); gpuSetup.finish(); if( CHECK==1 || CHECK==4) check_navigation( result.getHostPtr(), testCase.geom->VolumeStore, 0); // Run distance check // Transfer results back to host memory //const my_clock_t t3 = my_clock(); gpuOutput.transferFromDevice(); gpuSetup.finish(); //for (int i=0; i<10; i++) //std::cout<< (( int *)(result.getHostPtr()))[i] << " "; std::cout<< "\n"; for (int i=0; i<32; i++) std::cout<< (( G4double *)(result.getHostPtr()))[i] << " "; std::cout<<"From CPU -> the first particles position and direction are -" <<" X-: "<<testCase.input[0].pos.x<<" Y-: "<<testCase.input[0].pos.y<<" Z-: "<<testCase.input[0].pos.z; std::cout<<" \n and directions are -: X-: "<<testCase.input[0].dir.x<<" Y-: "<<testCase.input[0].dir.y<<" Z-: "<<testCase.input[0].dir.z; //const my_clock_t t4 = my_clock(); // Print time summary //std::cerr << "Elapsed: " << tdiffms(t1,t4) << " ms" //<< "\n Transfer: " << tdiffms( t1, t2 )+tdiffms(t3,t4) //<< "\n\tto GPU:\t" << tdiffms( t1, t2 ) //<< "\n\tfrom GPU:\t" << tdiffms( t3, t4 ) //<< "\n Calculation: " << tdiffms( t2, t3 ) << "\n\n"; //MODIFY // Output results (also a stupid copy) std::memcpy( &(testCase.output[0]), gpuOutput.getHostPtr(), gpuOutput.size() ); testCase.outputData( "imggpu.txt" ); return EXIT_SUCCESS; #ifndef NOCATCH } catch ( const std::runtime_error &e ) { std::cerr << e.what() << std::endl; return EXIT_FAILURE; } #endif }