//-------------------------------------------------------------------------------- SFString CSlurperApp::getFormatString(COptions& options, const SFString& which) { if (which == "file") buildDisplayStrings(options); SFString errMsg; SFString formatName = "fmt_" + options.exportFormat + "_" + which; SFString ret = config.GetProfileStringGH("DISPLAY_STR", formatName, EMPTY); if (ret.Contains("file:")) { SFString file = ret.Substitute("file:",EMPTY); if (!SFos::fileExists(file)) errMsg = SFString("Formatting file '") + file + "' for display string '" + formatName + "' not found. Quiting...\n"; else ret = asciiFileToString(file); } else if (ret.Contains("fmt_")) // it's referring to another format string... { SFString newName = ret; ret = config.GetProfileStringGH("DISPLAY_STR", newName, EMPTY); formatName += ":"+newName; } ret = ret.Substitute("\\n","\n").Substitute("\\t","\t"); // some sanity checks if (countOf('{',ret) != countOf('}',ret) || countOf('[',ret) != countOf(']',ret)) { errMsg = SFString("Mismatched brackets in display string '") + formatName + "': '" + ret + "'. Quiting...\n"; } else if (ret.IsEmpty()) { errMsg = SFString("Empty display string '") + formatName + "'. Quiting...\n"; } if (!errMsg.IsEmpty()) { outErr << errMsg; exit(0); } return ret; }
/// Reports whether the string is a plain decimal number.
///
/// Accepted form: an optional leading '-', then digits with at most one '.'
/// anywhere among them, and at least one digit overall. Exponents, a leading
/// '+', whitespace and any other character make the result false.
///
/// @return true when the whole string matches the form above; false otherwise,
///         including for the empty string and for "-", "." and "-.".
bool str::isFloat() const
{
    const int ourLen = getLen();
    int digitCount = 0;
    int dotCount = 0;

    // Skip a single leading minus sign; the length guard keeps us from
    // touching mpStr at all when the string is empty.
    for (int i = (ourLen > 0 && mpStr[0] == '-') ? 1 : 0; i < ourLen; i++)
    {
        const char thisChar = mpStr[i];
        if (thisChar == '.')
        {
            if (++dotCount > 1)
                return false;
        }
        // isdigit() is only defined for values representable as unsigned char
        // (or EOF), so a possibly negative char must be converted first.
        else if (isdigit(static_cast<unsigned char>(thisChar)))
        {
            ++digitCount;
        }
        else
        {
            return false;
        }
    }

    // Without this check "", "-" and "." would be accepted as numbers.
    return digitCount > 0;
}
/*
 * Repeatedly sweeps the Height x Width table and evaluates every cell whose
 * type is 'f' (formula). A cell for which evaluate() returns 1 gets its value
 * stored in 'evaluated' and its type changed to 'e'; any other return value
 * leaves the cell untouched so it is retried on the next sweep.
 *
 * Sweeping stops when no 'f' cells remain, or when a whole sweep resolves
 * nothing. The second condition prevents spinning forever on formulas that
 * can never be evaluated (for example circular references); such cells keep
 * type 'f' on return.
 * NOTE(review): this assumes evaluate()'s outcome depends only on the table
 * state, so a sweep without progress cannot be followed by one with progress
 * - confirm.
 */
void evaluateAllCells(){
    int remainingFormulas;
    int resolvedThisPass;
    do{
        int i,j;
        resolvedThisPass=0;
        for(i=0;i<Height;i++){
            for(j=0;j<Width;j++){
                int evaluatedValue;
                if(cellsTypes[i][j]!='f')
                    continue;
                /* Only a return value of exactly 1 counts as success. */
                if(evaluate(&inputTable[i][j],&evaluatedValue)!=1)
                    continue;
                evaluated[i][j]=evaluatedValue;
                cellsTypes[i][j]='e';
                resolvedThisPass++;
            }
        }
        remainingFormulas=countOf(cellsTypes,'f');
    }while(remainingFormulas>0 && resolvedThisPass>0);
}
std::basic_string<TCHAR> commatize (T number)
{
    // Formats 'number' in decimal with a ',' between every group of three
    // digits (e.g. 1234567 -> "1,234,567"); negative values get a leading '-'.
    // T is presumably an integral template parameter declared outside the
    // visible block - confirm.
    //
    // The text is built right-to-left in a local buffer. Eight characters per
    // byte of T leaves room for the digits, separators, sign and terminator.
    // The buffer is deliberately not static, so concurrent or nested calls do
    // not share state, and 'register' is gone because it is not valid from
    // C++17 onwards.
    TCHAR scratch [8*sizeof(T)];
    TCHAR * ptr = scratch + countOf( scratch );
    const bool negative = number < 0;   // always false for unsigned T

    *(--ptr) = 0;
    for (int digits = 3; ; )
    {
        // For negative values the remainder is in -9..0; take its magnitude
        // instead of negating 'number', which could overflow at the minimum.
        int digit = int (number % 10);
        if (digit < 0)
            digit = -digit;
        *(--ptr) = TCHAR ('0' + digit);

        number /= 10;
        if (0 == number)
            break;

        if (--digits <= 0)
        {
            *(--ptr) = ',';
            digits = 3;
        }
    }
    if (negative)
        *(--ptr) = '-';

    return std::basic_string<TCHAR> (ptr);
}
int main() { Solution s; { int A[] = {0}; s.sortColors(A, 0); printResult(A, 0); } { int A[] = { 0 }; s.sortColors(A, countOf(A)); printResult(A, countOf(A)); } { int A[] = { 0, 1 }; s.sortColors(A, countOf(A)); printResult(A, countOf(A)); } { int A[] = { 0, 2 }; s.sortColors(A, countOf(A)); printResult(A, countOf(A)); } { int A[] = { 1, 2 }; s.sortColors(A, countOf(A)); printResult(A, countOf(A)); } { int A[] = { 0, 1, 2 }; s.sortColors(A, countOf(A)); printResult(A, countOf(A)); } { int A[] = { 0, 1, 2 }; s.sortColors(A, countOf(A)); printResult(A, countOf(A)); } { int A[] = { 0, 2, 2 }; s.sortColors(A, countOf(A)); printResult(A, countOf(A)); } { int A[] = { 2, 0, 2 }; s.sortColors(A, countOf(A)); printResult(A, countOf(A)); } { int A[] = { 2, 2, 0 }; s.sortColors(A, countOf(A)); printResult(A, countOf(A)); } { int A[] = { 2, 1, 2, 0 }; s.sortColors(A, countOf(A)); printResult(A, countOf(A)); } { int A[] = { 2, 1, 2, 0, 1 }; s.sortColors(A, countOf(A)); printResult(A, countOf(A)); } { int A[] = { 2, 1, 2, 0, 1, 0, 2 }; s.sortColors(A, countOf(A)); printResult(A, countOf(A)); } { int A[] = { 2, 0, 1, 1, 2, 1, 2, 1, 2, 0, 1, 0, 2 }; s.sortColors(A, countOf(A)); printResult(A, countOf(A)); } }
// Entry point of the clFFT runtime test executable.
//
// Parses this program's own command-line options with boost::program_options
// (unrecognised options are tolerated), stores the results in variables that
// are defined outside this block (tolerance, number_of_random_tests,
// random_test_parameter_seed, device_type, device_gpu_list, verbose,
// comparison_type), optionally prints version and OpenCL information, then
// builds a fresh argument vector for Google Test and runs all tests.
//
// Returns 1 when mutually-exclusive options are combined, 0 after printing
// the help text, and otherwise the value of RUN_ALL_TESTS().
int main( int argc, char **argv )
{
    // Define MEMORYREPORT on Windows platforms to enable debug memory heap checking
#if defined( MEMORYREPORT ) && defined( _WIN32 )
    TCHAR logPath[ MAX_PATH ];
    ::GetCurrentDirectory( MAX_PATH, logPath );
    ::_tcscat_s( logPath, _T( "\\MemoryReport.txt") );

    // We leak the handle to this file, on purpose, so that the ::_CrtSetReportFile() can output its memory
    // statistics on app shutdown
    HANDLE hLogFile;
    hLogFile = ::CreateFile( logPath, GENERIC_WRITE, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL );

    ::_CrtSetReportMode( _CRT_ASSERT, _CRTDBG_MODE_FILE | _CRTDBG_MODE_WNDW | _CRTDBG_MODE_DEBUG );
    ::_CrtSetReportMode( _CRT_ERROR, _CRTDBG_MODE_FILE | _CRTDBG_MODE_WNDW | _CRTDBG_MODE_DEBUG );
    ::_CrtSetReportMode( _CRT_WARN, _CRTDBG_MODE_FILE | _CRTDBG_MODE_DEBUG );

    ::_CrtSetReportFile( _CRT_ASSERT, hLogFile );
    ::_CrtSetReportFile( _CRT_ERROR, hLogFile );
    ::_CrtSetReportFile( _CRT_WARN, hLogFile );

    int tmp = ::_CrtSetDbgFlag( _CRTDBG_REPORT_FLAG );
    tmp |= _CRTDBG_LEAK_CHECK_DF | _CRTDBG_ALLOC_MEM_DF | _CRTDBG_CHECK_ALWAYS_DF;
    ::_CrtSetDbgFlag( tmp );

    // By looking at the memory leak report that is generated by this debug heap, there is a number with
    // {} brackets that indicates the incremental allocation number of that block. If you wish to set
    // a breakpoint on that allocation number, put it in the _CrtSetBreakAlloc() call below, and the heap
    // will issue a bp on the request, allowing you to look at the call stack
    // ::_CrtSetBreakAlloc( 997 );
#endif /* MEMORYREPORT */

    // Declare the supported options.
    po::options_description desc( "clFFT Runtime Test command line options" );
    desc.add_options()
        ( "help,h", "produces this help message" )
        ( "verbose,v", "print out detailed information for the tests" )
        ( "noVersion", "Don't print version information from the clFFT library" )
        ( "noInfoCL", "Don't print information from the OpenCL runtime" )
        ( "cpu,c", "Run tests on a CPU device" )
        ( "gpu,g", "Run tests on a GPU device (default)" )
        ( "pointwise,p", "Do a pointwise comparison to determine test correctness (default: use root mean square)" )
        ( "tolerance,t", po::value< float >( &tolerance )->default_value( 0.001f ), "tolerance level to use when determining test pass/fail" )
        ( "numRandom,r", po::value< size_t >( &number_of_random_tests )->default_value( 2000 ), "number of random tests to run" )
        ( "seed", po::value< time_t >( &random_test_parameter_seed )->default_value( time(NULL)%1308000000 ), "seed to use for the random test. defaults to time(NULL)" )
        // modulo lops off the first few digits of the time value to make the seed easier to type
        // even without these digits, the seed value won't wrap around until 2036 or later
        ( "short,s", "Run radix 2 tests; no random testing" )
        ( "medium,m", "Run all radices; no random testing" )
        ;

    // Parse the command line options, ignore unrecognized options and collect them into a vector of strings
    po::variables_map vm;
    po::parsed_options parsed = po::command_line_parser( argc, argv ).options( desc ).allow_unregistered( ).run( );
    po::store( parsed, vm );
    po::notify( vm );
    // Collected here but not referenced again in this function; the Google Test
    // argument vector further down is rebuilt from the raw argv instead.
    std::vector< std::string > to_pass_further = po::collect_unrecognized( parsed.options, po::include_positional );

    std::cout << std::endl;

    // One bit per device option; (mutex & (mutex-1)) is non-zero exactly when
    // more than one bit is set, i.e. when both --gpu and --cpu were given.
    size_t mutex = ((vm.count( "gpu" ) > 0) ? 1 : 0) | ((vm.count( "cpu" ) > 0) ? 2 : 0);
    if ((mutex & (mutex-1)) != 0) {
        terr << _T("You have selected mutually-exclusive OpenCL device options:") << std::endl;
        if (vm.count ( "cpu" ) > 0) terr << _T(" cpu, c Run tests on a CPU device" ) << std::endl;
        if (vm.count ( "gpu" ) > 0) terr << _T(" gpu, g Run tests on a GPU device" ) << std::endl;
        return 1;
    }

    if( vm.count( "cpu" ) )
    {
        device_type = CL_DEVICE_TYPE_CPU;
    }

    if( vm.count( "gpu" ) )
    {
        device_type = CL_DEVICE_TYPE_GPU;
        // All bits set. NOTE(review): presumably a bitmask selecting every GPU - confirm.
        device_gpu_list = ~0;
    }

    // Print version by default
    if( !vm.count( "noVersion" ) )
    {
        // countOf is applied to the label literal, so the column width is derived from the label itself.
        const int indent = countOf( "clFFT client API version: " );
        tout << std::left << std::setw( indent ) << _T( "clFFT client API version: " ) << clfftVersionMajor << _T( "." ) << clfftVersionMinor << _T( "." ) << clfftVersionPatch << std::endl;

        cl_uint libMajor, libMinor, libPatch;
        clfftGetVersion( &libMajor, &libMinor, &libPatch );

        tout << std::left << std::setw( indent ) << _T( "clFFT runtime version: " ) << libMajor << _T( "." ) << libMinor << _T( "." ) << libPatch << std::endl << std::endl;
    }

    // Print clInfo by default
    if( !vm.count( "noInfoCL" ) )
    {
        // A throw-away OpenCL setup: initializeCL is called with 'true' as its last argument
        // (presumably "print info" - confirm) and everything is released again right away.
        cl_context tempContext = NULL;
        cl_command_queue tempQueue = NULL;
        cl_event tempEvent = NULL;
        std::vector< cl_device_id > device_id = ::initializeCL( device_type, device_gpu_list, tempContext, true );
        ::cleanupCL( &tempContext, &tempQueue, 0, NULL, 0, NULL, &tempEvent );
    }

    // Help is handled only now, so the version / OpenCL information above is printed before the help text.
    if( vm.count( "help" ) )
    {
        std::cout << desc << std::endl;
        return 0;
    }

    if( vm.count( "verbose" ) )
    {
        verbose = true;
    }
    else
    {
        verbose = false;
    }

    if( vm.count( "short" ) && vm.count( "medium" ) )
    {
        terr << _T("Options 'short' and 'medium' are mutually-exclusive. Please select only one.") << std::endl;
        return 1;
    }

    // Create a new argc,argv to pass to InitGoogleTest
    // First parameter of course is the name of this program
    std::vector< const char* > myArgv;

    // Push back a pointer to the executable name
    if( argc > 0 )
        myArgv.push_back( *argv );

    // Push into our new argv vector any parameter the user passed, except to filter their gtest_filter expressions
    std::string userFilter;
    for( int i = 1; i < argc; ++i )
    {
        if( vm.count( "short" ) || vm.count( "medium" ) )
        {
            std::string tmpStr( argv[ i ] );
            std::string::size_type pos = tmpStr.find( "gtest_filter" );
            if( pos == std::string::npos )
            {
                myArgv.push_back( argv[ i ] );
            }
            else
            {
                // Capture the users filter, but only the regexp portion
                // The first 15 characters are dropped, which is the length of "--gtest_filter=";
                // this assumes the argument starts with exactly that prefix.
                userFilter = argv[ i ];
                userFilter.erase( 0, 15 );
            }
        }
        else
        {
            myArgv.push_back( argv[ i ] );
        }
    }

    // newFilter must stay alive until InitGoogleTest has run: myArgv stores the pointer returned by c_str().
    std::string newFilter;
    if( vm.count( "short" ) )
    {
        // Built-in pattern first, then the user's own filter appended after a ':' separator.
        newFilter += "--gtest_filter=*accuracy_test_pow2*";
        if( userFilter.size( ) )
        {
            newFilter += ":";
            newFilter += userFilter;
        }
        myArgv.push_back( newFilter.c_str( ) );
    }

    if( vm.count( "medium" ) )
    {
        // The user's filter (if any) comes first, followed by the "-*Random*" pattern.
        newFilter += "--gtest_filter=";
        if( userFilter.size( ) )
        {
            newFilter += userFilter;
            newFilter += ":";
        }
        newFilter += "-*Random*";
        myArgv.push_back( newFilter.c_str( ) );
    }

    if( vm.count( "pointwise" ) )
    {
        comparison_type = pointwise_compare;
    }
    else
    {
        comparison_type = root_mean_square;
    }

    int myArgc = static_cast< int >( myArgv.size( ) );

    std::cout << "Result comparison tolerance is " << tolerance << std::endl;

    // NOTE(review): &myArgv[ 0 ] requires a non-empty vector; myArgv is empty when argc is 0
    // and neither --short nor --medium added a filter - confirm this cannot happen in practice.
    ::testing::InitGoogleTest( &myArgc, const_cast< char** >( &myArgv[ 0 ] ) );

    return RUN_ALL_TESTS();
}
int _tmain( int argc, _TCHAR* argv[] ) { size_t length = 0; size_t iDevice = 0; size_t numLoops = 0; bool defaultDevice = true; try { // Declare the supported options. po::options_description desc( "AMP Scan command line options" ); desc.add_options() ( "help,h", "produces this help message" ) ( "version,v", "Print queryable version information from the Bolt AMP library" ) ( "ampInfo,i", "Print queryable information of the AMP runtime" ) ( "device,d", po::value< size_t >( &iDevice ), "Choose specific AMP device, otherwise system default (AMP choose)" ) ( "length,l", po::value< size_t >( &length )->default_value( 4096 ), "Specify the length of scan array" ) ( "profile,p", po::value< size_t >( &numLoops )->default_value( 1 ), "Time and report Scan speed GB/s (default: profiling off)" ) ; po::variables_map vm; po::store( po::parse_command_line( argc, argv, desc ), vm ); po::notify( vm ); if( vm.count( "version" ) ) { // TODO: Query Bolt for its version information size_t libMajor, libMinor, libPatch; libMajor = 0; libMinor = 0; libPatch = 1; const int indent = countOf( "Bolt version: " ); bolt::tout << std::left << std::setw( indent ) << _T( "Bolt version: " ) << libMajor << _T( "." ) << libMinor << _T( "." 
) << libPatch << std::endl; } if( vm.count( "help" ) ) { // This needs to be 'cout' as program-options does not support wcout yet std::cout << desc << std::endl; return 0; } if( vm.count( "ampInfo" ) ) { concurrency::accelerator default_acc; std::wcout << std::left; std::wcout << std::setw( colWidth ) << _T( "Default device: " ) << default_acc.description << std::endl; std::wcout << std::setw( colWidth ) << _T( "Default device path: " ) << default_acc.device_path << std::endl << std::endl; //std::for_each( allDevices.begin( ), allDevices.end( ), printAccelerator ); std::vector< concurrency::accelerator > allDevices = concurrency::accelerator::get_all( ); for( unsigned int i = 0; i < allDevices.size( ); ++i ) printAccelerator( i, allDevices.at( i ) ); return 0; } if( vm.count( "device" ) ) { defaultDevice = false; } } catch( std::exception& e ) { bolt::terr << _T( "Bolt AMP error reported:" ) << std::endl << e.what() << std::endl; return 1; } // bolt::control::getDefault( ); std::vector< int > input( length, 1 ); bolt::statTimer& myTimer = bolt::statTimer::getInstance( ); myTimer.Reserve( 1, numLoops ); size_t reduceId = myTimer.getUniqueID( _T( "reduce" ), 0 ); for( unsigned i = 0; i < numLoops; ++i ) { myTimer.Start( reduceId ); int res = bolt::amp::reduce( input.begin( ), input.end( ), 0 ); myTimer.Stop( reduceId ); } // Remove all timings that are outside of 2 stddev (keep 65% of samples); we ignore outliers to get a more consistent result size_t pruned = myTimer.pruneOutliers( 1.0 ); double scanTime = myTimer.getAverageTime( reduceId ); double scanGB = ( input.size( ) * sizeof( int ) ) / (1024.0 * 1024.0 * 1024.0); bolt::tout << std::left; bolt::tout << std::setw( colWidth ) << _T( "Reduce profile: " ) << _T( "[" ) << numLoops-pruned << _T( "] samples" ) << std::endl; bolt::tout << std::setw( colWidth ) << _T( " Size (GB): " ) << scanGB << std::endl; bolt::tout << std::setw( colWidth ) << _T( " Time (s): " ) << scanTime << std::endl; bolt::tout << std::setw( 
colWidth ) << _T( " Speed (GB/s): " ) << scanGB / scanTime << std::endl; bolt::tout << std::endl; // bolt::tout << myTimer; return 0; }
int _tmain( int argc, _TCHAR* argv[] ) { // This helps with mixing output of both wide and narrow characters to the screen std::ios::sync_with_stdio( false ); // Define MEMORYREPORT on windows platfroms to enable debug memory heap checking #if defined( MEMORYREPORT ) && defined( _WIN32 ) TCHAR logPath[ MAX_PATH ]; ::GetCurrentDirectory( MAX_PATH, logPath ); ::_tcscat_s( logPath, _T( "\\MemoryReport.txt") ); // We leak the handle to this file, on purpose, so that the ::_CrtSetReportFile() can output it's memory // statistics on app shutdown HANDLE hLogFile; hLogFile = ::CreateFile( logPath, GENERIC_WRITE, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL ); ::_CrtSetReportMode( _CRT_ASSERT, _CRTDBG_MODE_FILE | _CRTDBG_MODE_WNDW | _CRTDBG_MODE_DEBUG ); ::_CrtSetReportMode( _CRT_ERROR, _CRTDBG_MODE_FILE | _CRTDBG_MODE_WNDW | _CRTDBG_MODE_DEBUG ); ::_CrtSetReportMode( _CRT_WARN, _CRTDBG_MODE_FILE | _CRTDBG_MODE_DEBUG ); ::_CrtSetReportFile( _CRT_ASSERT, hLogFile ); ::_CrtSetReportFile( _CRT_ERROR, hLogFile ); ::_CrtSetReportFile( _CRT_WARN, hLogFile ); int tmp = ::_CrtSetDbgFlag( _CRTDBG_REPORT_FLAG ); tmp |= _CRTDBG_LEAK_CHECK_DF | _CRTDBG_ALLOC_MEM_DF | _CRTDBG_CHECK_ALWAYS_DF; ::_CrtSetDbgFlag( tmp ); // By looking at the memory leak report that is generated by this debug heap, there is a number with // {} brackets that indicates the incremental allocation number of that block. 
If you wish to set // a breakpoint on that allocation number, put it in the _CrtSetBreakAlloc() call below, and the heap // will issue a bp on the request, allowing you to look at the call stack // ::_CrtSetBreakAlloc( 1833 ); #endif /* MEMORYREPORT */ // OpenCL state cl_device_type deviceType = CL_DEVICE_TYPE_ALL; cl_int deviceId = 0; cl_int platformId = 0; // FFT state clfftResultLocation place = CLFFT_INPLACE; clfftLayout inLayout = CLFFT_COMPLEX_INTERLEAVED; clfftLayout outLayout = CLFFT_COMPLEX_INTERLEAVED; clfftPrecision precision = CLFFT_SINGLE; clfftDirection dir = CLFFT_FORWARD; size_t lengths[ 3 ] = {1,1,1}; size_t iStrides[ 4 ] = {0,0,0,0}; size_t oStrides[ 4 ] = {0,0,0,0}; cl_uint profile_count = 0; cl_uint command_queue_flags = 0; size_t batchSize = 1; // Initialize flags for FFT library std::auto_ptr< clfftSetupData > setupData( new clfftSetupData ); OPENCL_V_THROW( clfftInitSetupData( setupData.get( ) ), "clfftInitSetupData failed" ); try { // Declare the supported options. 
po::options_description desc( "clFFT client command line options" ); desc.add_options() ( "help,h", "produces this help message" ) ( "version,v", "Print queryable version information from the clFFT library" ) ( "clinfo,i", "Print queryable information of all the OpenCL runtimes and devices" ) ( "printChosen", "Print queryable information of the selected OpenCL runtime and device" ) ( "gpu,g", "Force selection of OpenCL GPU devices only" ) ( "cpu,c", "Force selection of OpenCL CPU devices only" ) ( "all,a", "Force selection of all OpenCL devices (default)" ) ( "platform", po::value< cl_int >( &platformId )->default_value( 0 ), "Select a specific OpenCL platform id as it is reported by clinfo" ) ( "device", po::value< cl_int >( &deviceId )->default_value( 0 ), "Select a specific OpenCL device id as it is reported by clinfo" ) ( "outPlace,o", "Out of place FFT transform (default: in place)" ) ( "double", "Double precision transform (default: single)" ) ( "inv", "Backward transform (default: forward)" ) ( "dumpKernels,d", "FFT engine will dump generated OpenCL FFT kernels to disk (default: dump off)" ) ( "lenX,x", po::value< size_t >( &lengths[ 0 ] )->default_value( 1024 ), "Specify the length of the 1st dimension of a test array" ) ( "lenY,y", po::value< size_t >( &lengths[ 1 ] )->default_value( 1 ), "Specify the length of the 2nd dimension of a test array" ) ( "lenZ,z", po::value< size_t >( &lengths[ 2 ] )->default_value( 1 ), "Specify the length of the 3rd dimension of a test array" ) ( "isX", po::value< size_t >( &iStrides[ 0 ] )->default_value( 1 ), "Specify the input stride of the 1st dimension of a test array" ) ( "isY", po::value< size_t >( &iStrides[ 1 ] )->default_value( 0 ), "Specify the input stride of the 2nd dimension of a test array" ) ( "isZ", po::value< size_t >( &iStrides[ 2 ] )->default_value( 0 ), "Specify the input stride of the 3rd dimension of a test array" ) ( "iD", po::value< size_t >( &iStrides[ 3 ] )->default_value( 0 ), "input distance 
between subsequent sets of data when batch size > 1" ) ( "osX", po::value< size_t >( &oStrides[ 0 ] )->default_value( 1 ), "Specify the output stride of the 1st dimension of a test array" ) ( "osY", po::value< size_t >( &oStrides[ 1 ] )->default_value( 0 ), "Specify the output stride of the 2nd dimension of a test array" ) ( "osZ", po::value< size_t >( &oStrides[ 2 ] )->default_value( 0 ), "Specify the output stride of the 3rd dimension of a test array" ) ( "oD", po::value< size_t >( &oStrides[ 3 ] )->default_value( 0 ), "output distance between subsequent sets of data when batch size > 1" ) ( "batchSize,b", po::value< size_t >( &batchSize )->default_value( 1 ), "If this value is greater than one, arrays will be used " ) ( "profile,p", po::value< cl_uint >( &profile_count )->default_value( 1 ), "Time and report the kernel speed of the FFT (default: profiling off)" ) ( "inLayout", po::value< clfftLayout >( &inLayout )->default_value( CLFFT_COMPLEX_INTERLEAVED ), "Layout of input data:\n1) interleaved\n2) planar\n3) hermitian interleaved\n4) hermitian planar\n5) real" ) ( "outLayout", po::value< clfftLayout >( &outLayout )->default_value( CLFFT_COMPLEX_INTERLEAVED ), "Layout of input data:\n1) interleaved\n2) planar\n3) hermitian interleaved\n4) hermitian planar\n5) real" ) ; po::variables_map vm; po::store( po::parse_command_line( argc, argv, desc ), vm ); po::notify( vm ); if( vm.count( "version" ) ) { const int indent = countOf( "clFFT client API version: " ); tout << std::left << std::setw( indent ) << _T( "clFFT client API version: " ) << clfftVersionMajor << _T( "." ) << clfftVersionMinor << _T( "." ) << clfftVersionPatch << std::endl; cl_uint libMajor, libMinor, libPatch; clfftGetVersion( &libMajor, &libMinor, &libPatch ); tout << std::left << std::setw( indent ) << _T( "clFFT runtime version: " ) << libMajor << _T( "." ) << libMinor << _T( "." 
) << libPatch << std::endl << std::endl; } if( vm.count( "help" ) ) { // This needs to be 'cout' as program-options does not support wcout yet std::cout << desc << std::endl; return 0; } size_t mutex = ((vm.count( "gpu" ) > 0) ? 1 : 0) | ((vm.count( "cpu" ) > 0) ? 2 : 0) | ((vm.count( "all" ) > 0) ? 4 : 0); if ((mutex & (mutex-1)) != 0) { terr << _T("You have selected mutually-exclusive OpenCL device options:") << std::endl; if (vm.count ( "gpu" ) > 0) terr << _T(" gpu,g Force selection of OpenCL GPU devices only" ) << std::endl; if (vm.count ( "cpu" ) > 0) terr << _T(" cpu,c Force selection of OpenCL CPU devices only" ) << std::endl; if (vm.count ( "all" ) > 0) terr << _T(" all,a Force selection of all OpenCL devices (default)" ) << std::endl; return 1; } if( vm.count( "gpu" ) ) { deviceType = CL_DEVICE_TYPE_GPU; } if( vm.count( "cpu" ) ) { deviceType = CL_DEVICE_TYPE_CPU; } if( vm.count( "all" ) ) { deviceType = CL_DEVICE_TYPE_ALL; } if( vm.count( "clinfo" ) ) { std::vector< cl_platform_id > platformInfos; std::vector< std::vector< cl_device_id > > deviceInfos; discoverCLPlatforms( deviceType, platformInfos, deviceInfos ); prettyPrintCLPlatforms(platformInfos, deviceInfos); return 0; } bool printInfo = false; if( vm.count( "printChosen" ) ) { printInfo = true; } if( vm.count( "outPlace" ) ) { place = CLFFT_OUTOFPLACE; } if( vm.count( "double" ) ) { precision = CLFFT_DOUBLE; } if( vm.count( "inv" ) ) { dir = CLFFT_BACKWARD; } if( profile_count > 1 ) { command_queue_flags |= CL_QUEUE_PROFILING_ENABLE; } if( vm.count( "dumpKernels" ) ) { setupData->debugFlags |= CLFFT_DUMP_PROGRAMS; } int inL = (int)inLayout; int otL = (int)outLayout; // input output layout support matrix int ioLayoutSupport[5][5] = { { 1, 1, 0, 0, 1 }, { 1, 1, 0, 0, 1 }, { 0, 0, 0, 0, 1 }, { 0, 0, 0, 0, 1 }, { 1, 1, 1, 1, 0 }, }; if((inL < 1) || (inL > 5)) throw std::runtime_error( "Invalid Input layout format" ); if((otL < 1) || (otL > 5)) throw std::runtime_error( "Invalid Output layout format" 
); if(ioLayoutSupport[inL-1][otL-1] == 0) throw std::runtime_error( "Invalid combination of Input/Output layout formats" ); if( ((inL == 1) || (inL == 2)) && ((otL == 1) || (otL == 2)) ) // Complex-Complex cases { iStrides[1] = iStrides[1] ? iStrides[1] : lengths[0] * iStrides[0]; iStrides[2] = iStrides[2] ? iStrides[2] : lengths[1] * iStrides[1]; iStrides[3] = iStrides[3] ? iStrides[3] : lengths[2] * iStrides[2]; if(place == CLFFT_INPLACE) { oStrides[0] = iStrides[0]; oStrides[1] = iStrides[1]; oStrides[2] = iStrides[2]; oStrides[3] = iStrides[3]; } else { oStrides[1] = oStrides[1] ? oStrides[1] : lengths[0] * oStrides[0]; oStrides[2] = oStrides[2] ? oStrides[2] : lengths[1] * oStrides[1]; oStrides[3] = oStrides[3] ? oStrides[3] : lengths[2] * oStrides[2]; } } else // Real-Complex and Complex-Real cases { size_t *rst, *cst; size_t N = lengths[0]; size_t Nt = 1 + lengths[0]/2; bool iflag = false; bool rcFull = (inL == 1) || (inL == 2) || (otL == 1) || (otL == 2); if(inLayout == CLFFT_REAL) { iflag = true; rst = iStrides; } else { rst = oStrides; } // either in or out should be REAL // Set either in or out strides whichever is real if(place == CLFFT_INPLACE) { if(rcFull) { rst[1] = rst[1] ? rst[1] : N * 2 * rst[0]; } else { rst[1] = rst[1] ? rst[1] : Nt * 2 * rst[0]; } rst[2] = rst[2] ? rst[2] : lengths[1] * rst[1]; rst[3] = rst[3] ? rst[3] : lengths[2] * rst[2]; } else { rst[1] = rst[1] ? rst[1] : lengths[0] * rst[0]; rst[2] = rst[2] ? rst[2] : lengths[1] * rst[1]; rst[3] = rst[3] ? rst[3] : lengths[2] * rst[2]; } // Set the remaining of in or out strides that is not real if(iflag) { cst = oStrides; } else { cst = iStrides; } if(rcFull) { cst[1] = cst[1] ? cst[1] : N * cst[0]; } else { cst[1] = cst[1] ? cst[1] : Nt * cst[0]; } cst[2] = cst[2] ? cst[2] : lengths[1] * cst[1]; cst[3] = cst[3] ? 
cst[3] : lengths[2] * cst[2]; } if( precision == CLFFT_SINGLE ) transform<float>( lengths, iStrides, oStrides, batchSize, inLayout, outLayout, place, precision, dir, deviceType, deviceId, platformId, printInfo, command_queue_flags, profile_count, setupData ); else transform<double>( lengths, iStrides, oStrides, batchSize, inLayout, outLayout, place, precision, dir, deviceType, deviceId, platformId, printInfo, command_queue_flags, profile_count, setupData ); } catch( std::exception& e ) { terr << _T( "clFFT error condition reported:" ) << std::endl << e.what() << std::endl; return 1; } return 0; }
int transform( size_t* lengths, const size_t *inStrides, const size_t *outStrides, size_t batch_size, clfftLayout in_layout, clfftLayout out_layout, clfftResultLocation place, clfftPrecision precision, clfftDirection dir, cl_device_type deviceType, cl_int deviceId, cl_int platformId, bool printInfo, cl_uint command_queue_flags, cl_uint profile_count, std::auto_ptr< clfftSetupData > setupData ) { // Our command line does not specify what dimension FFT we wish to transform; we decode // this from the lengths that the user specifies for X, Y, Z. A length of one means that // The user does not want that dimension. const size_t max_dimensions = 3; size_t strides[ 4 ]; size_t o_strides[ 4 ]; size_t fftVectorSize = 0; size_t fftVectorSizePadded = 0; size_t fftBatchSize = 0; size_t outfftVectorSize = 0; size_t outfftVectorSizePadded = 0; size_t outfftBatchSize = 0; size_t size_of_input_buffers_in_bytes = 0; size_t size_of_output_buffers_in_bytes = 0; cl_uint number_of_output_buffers = 0; clfftDim dim = CLFFT_1D; cl_mem input_cl_mem_buffers [2] = { NULL, NULL }; cl_mem output_cl_mem_buffers[2] = { NULL, NULL }; std::vector< cl_device_id > device_id; cl_context context; cl_command_queue queue; cl_event outEvent = NULL; clfftPlanHandle plan_handle; for (unsigned u = 0; u < max_dimensions; ++u) { if (0 != lengths[u]) continue; lengths[u] = 1; } if( lengths[ 1 ] > 1 ) { dim = CLFFT_2D; } if( lengths[ 2 ] > 1 ) { dim = CLFFT_3D; } strides[ 0 ] = inStrides[0]; strides[ 1 ] = inStrides[1]; strides[ 2 ] = inStrides[2]; strides[ 3 ] = inStrides[3]; o_strides[ 0 ] = outStrides[0]; o_strides[ 1 ] = outStrides[1]; o_strides[ 2 ] = outStrides[2]; o_strides[ 3 ] = outStrides[3]; fftVectorSize = lengths[0] * lengths[1] * lengths[2]; fftVectorSizePadded = strides[3]; fftBatchSize = fftVectorSizePadded * batch_size; size_t Nt = 1 + lengths[0]/2; if(place == CLFFT_INPLACE) { outfftVectorSize = fftVectorSize; outfftVectorSizePadded = fftVectorSizePadded; outfftBatchSize = fftBatchSize; } else 
{ outfftVectorSize = lengths[0] * lengths[1] * lengths[2]; outfftVectorSizePadded = o_strides[3]; outfftBatchSize = outfftVectorSizePadded * batch_size; } // Real to complex case if( (in_layout == CLFFT_REAL) || (out_layout == CLFFT_REAL) ) { fftVectorSizePadded = strides[3]; fftBatchSize = fftVectorSizePadded * batch_size; outfftVectorSizePadded = o_strides[3]; outfftBatchSize = outfftVectorSizePadded * batch_size; fftVectorSize = lengths[0] * lengths[1] * lengths[2]; outfftVectorSize = fftVectorSize; } switch( out_layout ) { case CLFFT_COMPLEX_INTERLEAVED: number_of_output_buffers = 1; size_of_output_buffers_in_bytes = outfftBatchSize * sizeof( std::complex< T > ); break; case CLFFT_COMPLEX_PLANAR: number_of_output_buffers = 2; size_of_output_buffers_in_bytes = outfftBatchSize * sizeof(T); break; case CLFFT_HERMITIAN_INTERLEAVED: number_of_output_buffers = 1; size_of_output_buffers_in_bytes = outfftBatchSize * sizeof( std::complex< T > ); break; case CLFFT_HERMITIAN_PLANAR: number_of_output_buffers = 2; size_of_output_buffers_in_bytes = outfftBatchSize * sizeof(T); break; case CLFFT_REAL: number_of_output_buffers = 1; size_of_output_buffers_in_bytes = outfftBatchSize * sizeof(T); break; } // Fill the input buffers switch( in_layout ) { case CLFFT_COMPLEX_INTERLEAVED: { // This call creates our openCL context and sets up our devices; expected to throw on error size_of_input_buffers_in_bytes = fftBatchSize * sizeof( std::complex< T > ); device_id = initializeCL( deviceType, deviceId, platformId, context, printInfo ); createOpenCLCommandQueue( context, command_queue_flags, queue, device_id, size_of_input_buffers_in_bytes, 1, input_cl_mem_buffers, size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers); std::vector< std::complex< T > > input( fftBatchSize ); // set zero for( cl_uint i = 0; i < fftBatchSize; ++i ) { input[ i ] = 0; } // impulse test case for(size_t b = 0; b < batch_size; b++) { size_t p3 = b * strides[3]; for(size_t k = 0; k 
< lengths[2]; k++) { size_t p2 = p3 + k * strides[2]; for(size_t j = 0; j < lengths[1]; j++) { size_t p1 = p2 + j * strides[1]; for(size_t i = 0; i < lengths[0]; i++) { size_t p0 = p1 + i * strides[0]; input[p0] = 1; } } } } OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &input[ 0 ], 0, NULL, &outEvent ), "clEnqueueWriteBuffer failed" ); } break; case CLFFT_COMPLEX_PLANAR: { // This call creates our openCL context and sets up our devices; expected to throw on error size_of_input_buffers_in_bytes = fftBatchSize * sizeof( T ); device_id = initializeCL( deviceType, deviceId, platformId, context, printInfo ); createOpenCLCommandQueue( context, command_queue_flags, queue, device_id, size_of_input_buffers_in_bytes, 2, input_cl_mem_buffers, size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers); std::vector< T > real( fftBatchSize ); std::vector< T > imag( fftBatchSize ); // set zero for( cl_uint i = 0; i < fftBatchSize; ++i ) { real[ i ] = 0; imag[ i ] = 0; } // impulse test case for(size_t b = 0; b < batch_size; b++) { size_t p3 = b * strides[3]; for(size_t k = 0; k < lengths[2]; k++) { size_t p2 = p3 + k * strides[2]; for(size_t j = 0; j < lengths[1]; j++) { size_t p1 = p2 + j * strides[1]; for(size_t i = 0; i < lengths[0]; i++) { size_t p0 = p1 + i * strides[0]; real[p0] = 1; } } } } OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ], 0, NULL, &outEvent ), "clEnqueueWriteBuffer failed" ); OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &imag[ 0 ], 0, NULL, &outEvent ), "clEnqueueWriteBuffer failed" ); } break; case CLFFT_HERMITIAN_INTERLEAVED: { // This call creates our openCL context and sets up our devices; expected to throw on error size_of_input_buffers_in_bytes = fftBatchSize * sizeof( std::complex< T > ); device_id = 
initializeCL( deviceType, deviceId, platformId, context, printInfo ); createOpenCLCommandQueue( context, command_queue_flags, queue, device_id, size_of_input_buffers_in_bytes, 1, input_cl_mem_buffers, size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers); std::vector< std::complex< T > > input( fftBatchSize ); // set zero for( cl_uint i = 0; i < fftBatchSize; ++i ) { input[ i ] = 0; } // impulse test case for(size_t b = 0; b < batch_size; b++) { size_t p3 = b * strides[3]; input[p3] = static_cast<T>(outfftVectorSize); } OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &input[ 0 ], 0, NULL, &outEvent ), "clEnqueueWriteBuffer failed" ); } break; case CLFFT_HERMITIAN_PLANAR: { // This call creates our openCL context and sets up our devices; expected to throw on error size_of_input_buffers_in_bytes = fftBatchSize * sizeof( T ); device_id = initializeCL( deviceType, deviceId, platformId, context, printInfo ); createOpenCLCommandQueue( context, command_queue_flags, queue, device_id, size_of_input_buffers_in_bytes, 2, input_cl_mem_buffers, size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers); std::vector< T > real( fftBatchSize ); std::vector< T > imag( fftBatchSize ); // set zero for( cl_uint i = 0; i < fftBatchSize; ++i ) { real[ i ] = 0; imag[ i ] = 0; } // impulse test case for(size_t b = 0; b < batch_size; b++) { size_t p3 = b * strides[3]; real[p3] = static_cast<T>(outfftVectorSize); } OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ], 0, NULL, &outEvent ), "clEnqueueWriteBuffer failed" ); OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &imag[ 0 ], 0, NULL, &outEvent ), "clEnqueueWriteBuffer failed" ); } break; case CLFFT_REAL: { // This call creates our openCL context and sets up our devices; expected 
to throw on error size_of_input_buffers_in_bytes = fftBatchSize * sizeof( T ); device_id = initializeCL( deviceType, deviceId, platformId, context, printInfo ); createOpenCLCommandQueue( context, command_queue_flags, queue, device_id, size_of_input_buffers_in_bytes, 1, input_cl_mem_buffers, size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers); std::vector< T > real( fftBatchSize ); // set zero for( cl_uint i = 0; i < fftBatchSize; ++i ) { real[ i ] = 0; } // impulse test case for(size_t b = 0; b < batch_size; b++) { size_t p3 = b * strides[3]; for(size_t k = 0; k < lengths[2]; k++) { size_t p2 = p3 + k * strides[2]; for(size_t j = 0; j < lengths[1]; j++) { size_t p1 = p2 + j * strides[1]; for(size_t i = 0; i < lengths[0]; i++) { size_t p0 = p1 + i * strides[0]; real[p0] = 1; } } } } OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ], 0, NULL, &outEvent ), "clEnqueueWriteBuffer failed" ); } break; default: { throw std::runtime_error( "Input layout format not yet supported" ); } break; } // Discover and load the timer module if present void* timerLibHandle = LoadSharedLibrary( "lib", "StatTimer", false ); if( timerLibHandle == NULL ) { terr << _T( "Could not find the external timing library; timings disabled" ) << std::endl; } // Timer module discovered and loaded successfully // Initialize function pointers to call into the shared module PFGETSTATTIMER get_timer = reinterpret_cast< PFGETSTATTIMER > ( LoadFunctionAddr( timerLibHandle, "getStatTimer" ) ); // Create and initialize our timer class, if the external timer shared library loaded baseStatTimer* timer = NULL; size_t clFFTID = 0; if( get_timer ) { timer = get_timer( CLFFT_GPU ); timer->Reserve( 1, profile_count ); timer->setNormalize( true ); clFFTID = timer->getUniqueID( "clFFT", 0 ); } OPENCL_V_THROW( clfftSetup( setupData.get( ) ), "clfftSetup failed" ); OPENCL_V_THROW( clfftCreateDefaultPlan( 
&plan_handle, context, dim, lengths ), "clfftCreateDefaultPlan failed" ); // Default plan creates a plan that expects an inPlace transform with interleaved complex numbers OPENCL_V_THROW( clfftSetResultLocation( plan_handle, place ), "clfftSetResultLocation failed" ); OPENCL_V_THROW( clfftSetLayout( plan_handle, in_layout, out_layout ), "clfftSetLayout failed" ); OPENCL_V_THROW( clfftSetPlanBatchSize( plan_handle, batch_size ), "clfftSetPlanBatchSize failed" ); OPENCL_V_THROW( clfftSetPlanPrecision( plan_handle, precision ), "clfftSetPlanPrecision failed" ); OPENCL_V_THROW (clfftSetPlanInStride ( plan_handle, dim, strides ), "clfftSetPlanInStride failed" ); OPENCL_V_THROW (clfftSetPlanOutStride ( plan_handle, dim, o_strides ), "clfftSetPlanOutStride failed" ); OPENCL_V_THROW (clfftSetPlanDistance ( plan_handle, strides[ 3 ], o_strides[ 3 ]), "clfftSetPlanDistance failed" ); // Set backward scale factor to 1.0 for non real FFTs to do correct output checks if(dir == CLFFT_BACKWARD && in_layout != CLFFT_REAL && out_layout != CLFFT_REAL) OPENCL_V_THROW (clfftSetPlanScale( plan_handle, CLFFT_BACKWARD, (cl_float)1.0f ), "clfftSetPlanScale failed" ); OPENCL_V_THROW( clfftBakePlan( plan_handle, 1, &queue, NULL, NULL ), "clfftBakePlan failed" ); //get the buffersize size_t buffersize=0; OPENCL_V_THROW( clfftGetTmpBufSize(plan_handle, &buffersize ), "clfftGetTmpBufSize failed" ); //allocate the intermediate buffer cl_mem clMedBuffer=NULL; if (buffersize) { cl_int medstatus; clMedBuffer = clCreateBuffer ( context, CL_MEM_READ_WRITE, buffersize, 0, &medstatus); OPENCL_V_THROW( medstatus, "Creating intmediate Buffer failed" ); } switch( in_layout ) { case CLFFT_COMPLEX_INTERLEAVED: case CLFFT_COMPLEX_PLANAR: case CLFFT_HERMITIAN_INTERLEAVED: case CLFFT_HERMITIAN_PLANAR: case CLFFT_REAL: break; default: // Don't recognize input layout return CLFFT_INVALID_ARG_VALUE; } switch( out_layout ) { case CLFFT_COMPLEX_INTERLEAVED: case CLFFT_COMPLEX_PLANAR: case 
CLFFT_HERMITIAN_INTERLEAVED: case CLFFT_HERMITIAN_PLANAR: case CLFFT_REAL: break; default: // Don't recognize output layout return CLFFT_INVALID_ARG_VALUE; } if (( place == CLFFT_INPLACE ) && ( in_layout != out_layout )) { switch( in_layout ) { case CLFFT_COMPLEX_INTERLEAVED: { if( (out_layout == CLFFT_COMPLEX_PLANAR) || (out_layout == CLFFT_HERMITIAN_PLANAR) ) { throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" ); } break; } case CLFFT_COMPLEX_PLANAR: { if( (out_layout == CLFFT_COMPLEX_INTERLEAVED) || (out_layout == CLFFT_HERMITIAN_INTERLEAVED) ) { throw std::runtime_error( "Cannot use the same buffer for planar->interleaved in-place transforms" ); } break; } case CLFFT_HERMITIAN_INTERLEAVED: { if( out_layout != CLFFT_REAL ) { throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" ); } break; } case CLFFT_HERMITIAN_PLANAR: { throw std::runtime_error( "Cannot use the same buffer for planar->interleaved in-place transforms" ); break; } case CLFFT_REAL: { if( (out_layout == CLFFT_COMPLEX_PLANAR) || (out_layout == CLFFT_HERMITIAN_PLANAR) ) { throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" ); } break; } } } // Loop as many times as the user specifies to average out the timings // cl_mem * BuffersOut = ( place == CLFFT_INPLACE ) ? 
NULL : &output_cl_mem_buffers[ 0 ]; Timer tr; tr.Start(); for( cl_uint i = 0; i < profile_count; ++i ) { if( timer ) timer->Start( clFFTID ); OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &queue, 0, NULL, &outEvent, &input_cl_mem_buffers[ 0 ], BuffersOut, clMedBuffer ), "clfftEnqueueTransform failed" ); if( timer ) timer->Stop( clFFTID ); } OPENCL_V_THROW( clFinish( queue ), "clFinish failed" ); if(clMedBuffer) clReleaseMemObject(clMedBuffer); double wtime = tr.Sample()/((double)profile_count); size_t totalLen = 1; for(int i=0; i<dim; i++) totalLen *= lengths[i]; double opsconst = 5.0 * (double)totalLen * log((double)totalLen) / log(2.0); if(profile_count > 1) { tout << "\nExecution wall time: " << 1000.0*wtime << " ms" << std::endl; tout << "Execution gflops: " << ((double)batch_size * opsconst)/(1000000000.0*wtime) << std::endl; } if( timer && (command_queue_flags & CL_QUEUE_PROFILING_ENABLE) ) { // Remove all timings that are outside of 2 stddev (keep 65% of samples); we ignore outliers to get a more consistent result timer->pruneOutliers( 2.0 ); timer->Print( ); timer->Reset( ); } /*****************/ FreeSharedLibrary( timerLibHandle ); // Read and check output data // This check is not valid if the FFT is executed multiple times inplace. 
// if (( place == CLFFT_OUTOFPLACE ) || ( profile_count == 1)) { bool checkflag= false; switch( out_layout ) { case CLFFT_HERMITIAN_INTERLEAVED: case CLFFT_COMPLEX_INTERLEAVED: { std::vector< std::complex< T > > output( outfftBatchSize ); if( place == CLFFT_INPLACE ) { OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &output[ 0 ], 0, NULL, NULL ), "Reading the result buffer failed" ); } else { OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &output[ 0 ], 0, NULL, NULL ), "Reading the result buffer failed" ); } //check output data for( cl_uint i = 0; i < outfftBatchSize; ++i ) { if (0 == (i % outfftVectorSizePadded)) { if (output[i].real() != outfftVectorSize) { checkflag = true; break; } } else { if (output[ i ].real() != 0) { checkflag = true; break; } } if (output[ i ].imag() != 0) { checkflag = true; break; } } } break; case CLFFT_HERMITIAN_PLANAR: case CLFFT_COMPLEX_PLANAR: { std::valarray< T > real( outfftBatchSize ); std::valarray< T > imag( outfftBatchSize ); if( place == CLFFT_INPLACE ) { OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ], 0, NULL, NULL ), "Reading the result buffer failed" ); OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &imag[ 0 ], 0, NULL, NULL ), "Reading the result buffer failed" ); } else { OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &real[ 0 ], 0, NULL, NULL ), "Reading the result buffer failed" ); OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 1 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &imag[ 0 ], 0, NULL, NULL ), "Reading the result buffer failed" ); } // Check output data for( cl_uint i = 0; i < outfftBatchSize; ++i ) { if (0 == (i % outfftVectorSizePadded)) { if (real[i] != 
outfftVectorSize) { checkflag = true; break; } } else { if (real[i] != 0) { checkflag = true; break; } } if (imag[i] != 0) { checkflag = true; break; } } } break; case CLFFT_REAL: { std::valarray< T > real( outfftBatchSize ); if( place == CLFFT_INPLACE ) { OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ], 0, NULL, NULL ), "Reading the result buffer failed" ); } else { OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &real[ 0 ], 0, NULL, NULL ), "Reading the result buffer failed" ); } ////check output data for(size_t b = 0; b < batch_size; b++) { size_t p3 = b * o_strides[3]; for(size_t k = 0; k < lengths[2]; k++) { size_t p2 = p3 + k * o_strides[2]; for(size_t j = 0; j < lengths[1]; j++) { size_t p1 = p2 + j * o_strides[1]; for(size_t i = 0; i < lengths[0]; i++) { size_t p0 = p1 + i * o_strides[0]; if (real[p0] != 1) { checkflag = true; break; } } } } } } break; default: { throw std::runtime_error( "Input layout format not yet supported" ); } break; } if (checkflag) { std::cout << "\n\n\t\tInternal Client Test *****FAIL*****" << std::endl; } else { std::cout << "\n\n\t\tInternal Client Test *****PASS*****" << std::endl; } } OPENCL_V_THROW( clfftDestroyPlan( &plan_handle ), "clfftDestroyPlan failed" ); OPENCL_V_THROW( clfftTeardown( ), "clfftTeardown failed" ); cleanupCL( &context, &queue, countOf( input_cl_mem_buffers ), input_cl_mem_buffers, countOf( output_cl_mem_buffers ), output_cl_mem_buffers, &outEvent ); return 0; }
//-------------------------------------------------------------------------------- SFBool CSlurperApp::Slurp(COptions& options, SFString& message) { double start = vrNow(); // Do we have the data for this address cached? SFString cacheFilename = cachePath(theAccount.addr+".bin"); SFBool needToRead = SFos::fileExists(cacheFilename); if (options.rerun && theAccount.transactions.getCount()) needToRead=FALSE; if (needToRead) { // Once a transaction is on the blockchain, it will never change // therefore, we can store them in a binary cache. Here we read // from a previously stored cache. SFArchive archive(TRUE, NO_SCHEMA); if (!archive.Lock(cacheFilename, binaryReadOnly, LOCK_NOWAIT)) { message = "Could not open file: '" + cacheFilename + "'\n"; return options.cmdFile; } theAccount.Serialize(archive); archive.Close(); } SFTime now = Now(); SFTime fileTime = SFos::fileLastModifyDate(cacheFilename); // If the user tells us he/she wants to update the cache, or the cache // hasn't been updated in five minutes, then update it SFInt32 nSeconds = MAX(60,config.GetProfileIntGH("SETTINGS", "update_freq", 300)); if (options.slurp || (now - fileTime) > SFTimeSpan(0,0,0,nSeconds)) { // This is how many records we currently have SFInt32 origCount = theAccount.transactions.getCount(); SFInt32 nNewBlocks = 0; SFInt32 nextRecord = origCount; outErr << "\tSlurping new transactions from blockchain...\n"; SFInt32 nRequests = 0, nRead = 0; // We already have 'page' pages, so start there. 
SFInt32 page = MAX(theAccount.lastPage,1); // Keep reading until we get less than a full page SFString contents; SFBool done = FALSE; while (!done) { SFString url = SFString("https://api.etherscan.io/api?module=account&action=txlist&sort=asc") + "&address=" + theAccount.addr + "&page=" + asString(page) + "&offset=" + asString(options.pageSize) + "&apikey=" + api.getKey(); // Grab a page of data from the web api SFString thisPage = urlToString(url); // See if it's good data, if not, bail message = nextTokenClear(thisPage, '['); if (!message.Contains("{\"status\":\"1\",\"message\":\"OK\"")) { if (message.Contains("{\"status\":\"0\",\"message\":\"No transactions found\",\"result\":")) message = "No transactions were found for address '" + theAccount.addr + "'. Is it correct?"; return options.cmdFile; } contents += thisPage; SFInt32 nRecords = countOf('}',thisPage)-1; nRead += nRecords; outErr << "\tDownloaded " << nRead << " potentially new transactions." << (isTesting?"\n":"\r"); // If we got a full page, there are more to come done = (nRecords < options.pageSize); if (!done) page++; // Etherscan.io doesn't want more than five pages per second, so sleep for a second if (++nRequests==4) { SFos::sleep(1.0); nRequests=0; } // Make sure we don't spin forever if (nRead >= options.maxTransactions) done=TRUE; } SFInt32 minBlock=0,maxBlock=0; findBlockRange(contents, minBlock, maxBlock); outErr << "\n\tDownload contains blocks from " << minBlock << " to " << maxBlock << "\n"; // Keep track of which last full page we've read theAccount.lastPage = page; theAccount.pageSize = options.pageSize; // pre allocate the array theAccount.transactions.Grow(nRead); SFInt32 lastBlock=0; char *p = cleanUpJson((char *)(const char*)contents); while (p && *p) { CTransaction trans;SFInt32 nFields=0; p = trans.parseJson(p,nFields); if (nFields) { SFInt32 transBlock = trans.blockNumber; if (transBlock > theAccount.lastBlock) // add the new transaction if it's in a new block { 
theAccount.transactions[nextRecord++] = trans; lastBlock = transBlock; if (!(++nNewBlocks%REP_FREQ)) { outErr << "\tFound new transaction at block " << transBlock << ". Importing..." << (isTesting?"\n":"\r"); outErr.Flush(); } } } } if (!isTesting && nNewBlocks) { outErr << "\tFound new transaction at block " << lastBlock << ". Importing...\n"; outErr.Flush(); } theAccount.lastBlock = lastBlock; // Write the data if we got new data SFInt32 newRecords = (theAccount.transactions.getCount() - origCount); if (newRecords) { outErr << "\tWriting " << newRecords << " new records to cache\n"; SFArchive archive(FALSE, NO_SCHEMA); if (archive.Lock(cacheFilename, binaryWriteCreate, LOCK_CREATE)) { theAccount.transactions.Sort(sortTransactionsForWrite); theAccount.Serialize(archive); archive.Close(); } else { message = "Could not open file: '" + cacheFilename + "'\n"; return options.cmdFile; } } } if (!isTesting) { double stop = vrNow(); double timeSpent = stop-start; fprintf(stderr, "\tLoaded %ld total records in %f seconds\n", theAccount.transactions.getCount(), timeSpent); fflush(stderr); } return (options.cmdFile || theAccount.transactions.getCount()>0); }
int _tmain( int argc, _TCHAR* argv[] ) { cl_uint userPlatform = 0; cl_uint userDevice = 0; size_t iterations = 0; size_t length = 0; size_t algo = 1; cl_device_type deviceType = CL_DEVICE_TYPE_DEFAULT; bool defaultDevice = true; bool print_clInfo = false; bool systemMemory = false; /****************************************************************************** * Parameter parsing * ******************************************************************************/ try { // Declare the supported options. po::options_description desc( "OpenCL CopyBuffer command line options" ); desc.add_options() ( "help,h", "produces this help message" ) ( "version,v", "Print queryable version information from the Bolt CL library" ) ( "queryOpenCL,q", "Print queryable platform and device info and return" ) ( "gpu,g", "Report only OpenCL GPU devices" ) ( "cpu,c", "Report only OpenCL CPU devices" ) ( "all,a", "Report all OpenCL devices" ) ( "systemMemory,s", "Allocate vectors in system memory, otherwise device memory" ) ( "platform,p", po::value< cl_uint >( &userPlatform )->default_value( 0 ), "Specify the platform under test using the index reported by -q flag" ) ( "device,d", po::value< cl_uint >( &userDevice )->default_value( 0 ), "Specify the device under test using the index reported by the -q flag. 
" "Index is relative with respect to -g, -c or -a flags" ) ( "length,l", po::value< size_t >( &length )->default_value( 1048576 ), "Specify the length of scan array" ) ( "iterations,i", po::value< size_t >( &iterations )->default_value( 50 ), "Number of samples in timing loop" ) //( "algo,a", po::value< size_t >( &algo )->default_value( 1 ), "Algorithm used [1,2] 1:SCAN_BOLT, 2:XYZ" )//Not used in this file ; po::variables_map vm; po::store( po::parse_command_line( argc, argv, desc ), vm ); po::notify( vm ); if( vm.count( "version" ) ) { cl_uint libMajor, libMinor, libPatch; bolt::cl::getVersion( libMajor, libMinor, libPatch ); const int indent = countOf( "Bolt version: " ); bolt::tout << std::left << std::setw( indent ) << _T( "Bolt version: " ) << libMajor << _T( "." ) << libMinor << _T( "." ) << libPatch << std::endl; } if( vm.count( "help" ) ) { // This needs to be 'cout' as program-options does not support wcout yet std::cout << desc << std::endl; return 0; } if( vm.count( "queryOpenCL" ) ) { print_clInfo = true; } if( vm.count( "gpu" ) ) { deviceType = CL_DEVICE_TYPE_GPU; } if( vm.count( "cpu" ) ) { deviceType = CL_DEVICE_TYPE_CPU; } if( vm.count( "all" ) ) { deviceType = CL_DEVICE_TYPE_ALL; } if( vm.count( "systemMemory" ) ) { systemMemory = true; } } catch( std::exception& e ) { std::cout << _T( "Scan Benchmark error condition reported:" ) << std::endl << e.what() << std::endl; return 1; } /****************************************************************************** * Initialize platforms and devices * * /todo we should move this logic inside of the control class * ******************************************************************************/ // Query OpenCL for available platforms cl_int err = CL_SUCCESS; // Platform vector contains all available platforms on system std::vector< cl::Platform > platforms; bolt::cl::V_OPENCL( cl::Platform::get( &platforms ), "Platform::get() failed" ); if( print_clInfo ) { // /todo: port the printing code from test/scan 
to control class //std::for_each( platforms.begin( ), platforms.end( ), printPlatformFunctor( 0 ) ); return 0; } // Device info std::vector< cl::Device > devices; bolt::cl::V_OPENCL( platforms.at( userPlatform ).getDevices( deviceType, &devices ), "Platform::getDevices() failed" ); cl::Context myContext( devices.at( userDevice ) ); cl::CommandQueue myQueue( myContext, devices.at( userDevice ) ); // Now that the device we want is selected and we have created our own cl::CommandQueue, set it as the // default cl::CommandQueue for the Bolt API bolt::cl::control::getDefault( ).setCommandQueue( myQueue ); std::string strDeviceName = bolt::cl::control::getDefault( ).getDevice( ).getInfo< CL_DEVICE_NAME >( &err ); bolt::cl::V_OPENCL( err, "Device::getInfo< CL_DEVICE_NAME > failed" ); std::cout << "Device under test : " << strDeviceName << std::endl; /****************************************************************************** * Benchmark logic * ******************************************************************************/ bolt::statTimer& myTimer = bolt::statTimer::getInstance( ); myTimer.Reserve( 1, iterations ); size_t scanId = myTimer.getUniqueID( _T( "copybuffer" ), 0 ); size_t pruned = 0; double scanTime = std::numeric_limits< double >::max( ); double scanGB = ( length * sizeof( int ) ) / (1024.0 * 1024.0 * 1024.0); ::cl::CommandQueue& boltQueue = bolt::cl::control::getDefault( ).getCommandQueue( ); // ::cl::Buffer can not handle buffers of size 0 if( length > 0 ) { if( systemMemory ) { std::vector< int > input( length, 1 ); std::vector< int > output( length ); ::cl::Buffer inputBuffer( bolt::cl::control::getDefault( ).getContext( ), CL_MEM_USE_HOST_PTR|CL_MEM_READ_ONLY, length * sizeof( int ), input.data( ) ); ::cl::Buffer outputBuffer( bolt::cl::control::getDefault( ).getContext( ), CL_MEM_USE_HOST_PTR|CL_MEM_WRITE_ONLY, length * sizeof( int ), output.data( ) ); for( unsigned i = 0; i < iterations; ++i ) { myTimer.Start( scanId ); boltQueue.enqueueCopyBuffer( 
inputBuffer, outputBuffer, 0, 0, length * sizeof( int ) ); void* tmpPtr = boltQueue.enqueueMapBuffer( outputBuffer, true, CL_MAP_READ, 0, length * sizeof( int ) ); boltQueue.enqueueUnmapMemObject( outputBuffer, tmpPtr ); boltQueue.finish( ); myTimer.Stop( scanId ); } } else { ::cl::Buffer inputBuffer( bolt::cl::control::getDefault( ).getContext( ), CL_MEM_READ_ONLY, length * sizeof( int ) ); ::cl::Buffer outputBuffer( bolt::cl::control::getDefault( ).getContext( ), CL_MEM_WRITE_ONLY, length * sizeof( int ) ); for( unsigned i = 0; i < iterations; ++i ) { myTimer.Start( scanId ); boltQueue.enqueueCopyBuffer( inputBuffer, outputBuffer, 0, 0, length * sizeof( int ) ); boltQueue.finish( ); myTimer.Stop( scanId ); } } // Remove all timings that are outside of 2 stddev (keep 65% of samples); we ignore outliers to get a more consistent result pruned = myTimer.pruneOutliers( 1.0 ); scanTime = myTimer.getAverageTime( scanId ); } else { iterations = 0; } bolt::tout << std::left; bolt::tout << std::setw( colWidth ) << _T( "CopyBuffer profile: " ) << _T( "[" ) << iterations-pruned << _T( "] samples" ) << std::endl; bolt::tout << std::setw( colWidth ) << _T( " Size (GB): " ) << scanGB << std::endl; bolt::tout << std::setw( colWidth ) << _T( " Time (s): " ) << scanTime << std::endl; bolt::tout << std::setw( colWidth ) << _T( " Speed (GB/s): " ) << scanGB / scanTime << std::endl; bolt::tout << std::endl; // bolt::tout << myTimer; return 0; }