__device__ void single_precision_intrinsics() { float fX, fY; __cosf(0.0f); __exp10f(0.0f); __expf(0.0f); __fadd_rd(0.0f, 1.0f); __fadd_rn(0.0f, 1.0f); __fadd_ru(0.0f, 1.0f); __fadd_rz(0.0f, 1.0f); __fdiv_rd(4.0f, 2.0f); __fdiv_rn(4.0f, 2.0f); __fdiv_ru(4.0f, 2.0f); __fdiv_rz(4.0f, 2.0f); __fdividef(4.0f, 2.0f); __fmaf_rd(1.0f, 2.0f, 3.0f); __fmaf_rn(1.0f, 2.0f, 3.0f); __fmaf_ru(1.0f, 2.0f, 3.0f); __fmaf_rz(1.0f, 2.0f, 3.0f); __fmul_rd(1.0f, 2.0f); __fmul_rn(1.0f, 2.0f); __fmul_ru(1.0f, 2.0f); __fmul_rz(1.0f, 2.0f); __frcp_rd(2.0f); __frcp_rn(2.0f); __frcp_ru(2.0f); __frcp_rz(2.0f); __frsqrt_rn(4.0f); __fsqrt_rd(4.0f); __fsqrt_rn(4.0f); __fsqrt_ru(4.0f); __fsqrt_rz(4.0f); __fsub_rd(2.0f, 1.0f); __fsub_rn(2.0f, 1.0f); __fsub_ru(2.0f, 1.0f); __fsub_rz(2.0f, 1.0f); __log10f(1.0f); __log2f(1.0f); __logf(1.0f); __powf(1.0f, 0.0f); __saturatef(0.1f); __sincosf(0.0f, &fX, &fY); __sinf(0.0f); __tanf(0.0f); }
void BlackScholesGPU( float *d_CallResult, float *d_PutResult, float *d_StockPrice, float *d_OptionStrike, float *d_OptionYears, float Riskfree, float Volatility, int optN ) { //Thread index const int tid = (__builtin_ptx_read_ctaid_x() * __builtin_ptx_read_ntid_x()) + __builtin_ptx_read_tid_x(); //Total number of threads in execution grid const int THREAD_N = __builtin_ptx_read_nctaid_x() * __builtin_ptx_read_ntid_x(); //No matter how small is execution grid or how large OptN is, //exactly OptN indices will be processed with perfect memory coalescing for (int opt = tid; opt < optN; opt += THREAD_N) { float sqrtT, expRT; float d1, d2, CNDD1, CNDD2; sqrtT = sqrtf(T); d1 = (__logf(S / X) + (R + 0.5f * V * V) * T) / (V * sqrtT); d2 = d1 - V * sqrtT; //CNDD1 = cndGPU(d1); const float A1 = 0.31938153f; const float A2 = -0.356563782f; const float A3 = 1.781477937f; const float A4 = -1.821255978f; const float A5 = 1.330274429f; const float RSQRT2PI = 0.39894228040143267793994605993438f; float K = 1.0f / (1.0f + 0.2316419f * fabsf(d)); float cnd = RSQRT2PI * __expf(- 0.5f * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); if (d > 0) CNDD1 = 1.0f - cnd; //CNDD2 = cndGPU(d2); K = 1.0f / (1.0f + 0.2316419f * fabsf(d)); cnd = RSQRT2PI * __expf(- 0.5f * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); if (d > 0) CNDD2 = 1.0f - cnd; //Calculate Call and Put simultaneously expRT = __expf(- R * T); CallResult = S * CNDD1 - X * expRT * CNDD2; PutResult = X * expRT * (1.0f - CNDD2) - S * (1.0f - CNDD1); } }