/*
 * Estimate the time taken by one nu step, in milliseconds.
 *
 * di           - device description; only inspected to see whether the
 *                device is compute capability 1.3.
 * flopsPerIter - amount of floating point work in one iteration.
 * flops        - theoretical floating point throughput of the device
 *                (presumably per second, given the 1000.0 scaling to
 *                milliseconds -- confirm against callers).
 *
 * The architecture factor acts as a sort of efficiency factor for the
 * architecture relative to its theoretical FLOPs. Scaling by the device's
 * theoretical flops then extends the estimate from the reference devices
 * to the device at hand.
 */
cl_double cudaEstimateIterTime(const DevInfo* di, cl_double flopsPerIter, cl_double flops)
{
    cl_double archFactor;

    /* Experimentally determined constants */
    if (mwComputeCapabilityIs(di, 1, 3))
    {
        archFactor = 1.87;
    }
    else
    {
        archFactor = 1.53;
    }

    return 1000.0 * archFactor * flopsPerIter / flops;
}
/*
 * Choose the Nvidia compiler flag that caps the number of registers used.
 *
 * Returns the "-cl-nv-maxrregcount=32 " flag (with its trailing space) for
 * compute capability 1.3 devices and logs that choice; returns an empty
 * string for every other device. Both results are string literals, so the
 * caller must not free or modify them.
 */
static const char* getNvidiaRegCount(const DevInfo* di)
{
    const char* maxRegs32Flag = "-cl-nv-maxrregcount=32 ";

    if (!mwComputeCapabilityIs(di, 1, 3))
    {
        /* Anything higher or different is Fermi or unknown: no extra flag */
        return "";
    }

    /* 1.3 == GT200. 32 allows for greatest number of threads at a time */
    mw_printf("Found a compute capability 1.3 device. Using %s\n", maxRegs32Flag);

    return maxRegs32Flag;
}