static float lg_cospi (float x) { if (x <= 0.25f) return __cosf ((float) M_PI * x); else return __sinf ((float) M_PI * (0.5f - x)); }
float __ieee754_y1f(float x) { float z, s,c,ss,cc,u,v; int32_t hx,ix; GET_FLOAT_WORD(hx,x); ix = 0x7fffffff&hx; /* if Y1(NaN) is NaN, Y1(-inf) is NaN, Y1(inf) is 0 */ if(__builtin_expect(ix>=0x7f800000, 0)) return one/(x+x*x); if(__builtin_expect(ix==0, 0)) return -HUGE_VALF+x; /* -inf and overflow exception. */ if(__builtin_expect(hx<0, 0)) return zero/(zero*x); if(ix >= 0x40000000) { /* |x| >= 2.0 */ SET_RESTORE_ROUNDF (FE_TONEAREST); __sincosf (x, &s, &c); ss = -s-c; cc = s-c; if(ix<0x7f000000) { /* make sure x+x not overflow */ z = __cosf(x+x); if ((s*c)>zero) cc = z/ss; else ss = z/cc; } /* y1(x) = sqrt(2/(pi*x))*(p1(x)*sin(x0)+q1(x)*cos(x0)) * where x0 = x-3pi/4 * Better formula: * cos(x0) = cos(x)cos(3pi/4)+sin(x)sin(3pi/4) * = 1/sqrt(2) * (sin(x) - cos(x)) * sin(x0) = sin(x)cos(3pi/4)-cos(x)sin(3pi/4) * = -1/sqrt(2) * (cos(x) + sin(x)) * To avoid cancellation, use * sin(x) +- cos(x) = -cos(2x)/(sin(x) -+ cos(x)) * to compute the worse one. */ if(ix>0x48000000) z = (invsqrtpi*ss)/__ieee754_sqrtf(x); else { u = ponef(x); v = qonef(x); z = invsqrtpi*(u*ss+v*cc)/__ieee754_sqrtf(x); } return z; } if(__builtin_expect(ix<=0x33000000, 0)) { /* x < 2**-25 */ z = -tpi / x; if (__isinff (z)) __set_errno (ERANGE); return z; } z = x*x; u = U0[0]+z*(U0[1]+z*(U0[2]+z*(U0[3]+z*U0[4]))); v = one+z*(V0[0]+z*(V0[1]+z*(V0[2]+z*(V0[3]+z*V0[4])))); return(x*(u/v) + tpi*(__ieee754_j1f(x)*__ieee754_logf(x)-one/x)); }
float __ieee754_y0f(float x) { float z, s,c,ss,cc,u,v; int32_t hx,ix; GET_FLOAT_WORD(hx,x); ix = 0x7fffffff&hx; /* Y0(NaN) is NaN, y0(-inf) is Nan, y0(inf) is 0, y0(0) is -inf. */ if(ix>=0x7f800000) return one/(x+x*x); if(ix==0) return -HUGE_VALF+x; /* -inf and overflow exception. */ if(hx<0) return zero/(zero*x); if(ix >= 0x40000000) { /* |x| >= 2.0 */ /* y0(x) = sqrt(2/(pi*x))*(p0(x)*sin(x0)+q0(x)*cos(x0)) * where x0 = x-pi/4 * Better formula: * cos(x0) = cos(x)cos(pi/4)+sin(x)sin(pi/4) * = 1/sqrt(2) * (sin(x) + cos(x)) * sin(x0) = sin(x)cos(3pi/4)-cos(x)sin(3pi/4) * = 1/sqrt(2) * (sin(x) - cos(x)) * To avoid cancellation, use * sin(x) +- cos(x) = -cos(2x)/(sin(x) -+ cos(x)) * to compute the worse one. */ __sincosf (x, &s, &c); ss = s-c; cc = s+c; /* * j0(x) = 1/sqrt(pi) * (P(0,x)*cc - Q(0,x)*ss) / sqrt(x) * y0(x) = 1/sqrt(pi) * (P(0,x)*ss + Q(0,x)*cc) / sqrt(x) */ if(ix<0x7f000000) { /* make sure x+x not overflow */ z = -__cosf(x+x); if ((s*c)<zero) cc = z/ss; else ss = z/cc; } if(ix>0x48000000) z = (invsqrtpi*ss)/__ieee754_sqrtf(x); else { u = pzerof(x); v = qzerof(x); z = invsqrtpi*(u*ss+v*cc)/__ieee754_sqrtf(x); } return z; } if(ix<=0x32000000) { /* x < 2**-27 */ return(u00 + tpi*__ieee754_logf(x)); } z = x*x; u = u00+z*(u01+z*(u02+z*(u03+z*(u04+z*(u05+z*u06))))); v = one+z*(v01+z*(v02+z*(v03+z*v04))); return(u/v + tpi*(__ieee754_j0f(x)*__ieee754_logf(x))); }
__device__ void single_precision_intrinsics() { float fX, fY; __cosf(0.0f); __exp10f(0.0f); __expf(0.0f); __fadd_rd(0.0f, 1.0f); __fadd_rn(0.0f, 1.0f); __fadd_ru(0.0f, 1.0f); __fadd_rz(0.0f, 1.0f); __fdiv_rd(4.0f, 2.0f); __fdiv_rn(4.0f, 2.0f); __fdiv_ru(4.0f, 2.0f); __fdiv_rz(4.0f, 2.0f); __fdividef(4.0f, 2.0f); __fmaf_rd(1.0f, 2.0f, 3.0f); __fmaf_rn(1.0f, 2.0f, 3.0f); __fmaf_ru(1.0f, 2.0f, 3.0f); __fmaf_rz(1.0f, 2.0f, 3.0f); __fmul_rd(1.0f, 2.0f); __fmul_rn(1.0f, 2.0f); __fmul_ru(1.0f, 2.0f); __fmul_rz(1.0f, 2.0f); __frcp_rd(2.0f); __frcp_rn(2.0f); __frcp_ru(2.0f); __frcp_rz(2.0f); __frsqrt_rn(4.0f); __fsqrt_rd(4.0f); __fsqrt_rn(4.0f); __fsqrt_ru(4.0f); __fsqrt_rz(4.0f); __fsub_rd(2.0f, 1.0f); __fsub_rn(2.0f, 1.0f); __fsub_ru(2.0f, 1.0f); __fsub_rz(2.0f, 1.0f); __log10f(1.0f); __log2f(1.0f); __logf(1.0f); __powf(1.0f, 0.0f); __saturatef(0.1f); __sincosf(0.0f, &fX, &fY); __sinf(0.0f); __tanf(0.0f); }
float __ieee754_j0f(float x) { float z, s,c,ss,cc,r,u,v; int32_t hx,ix; GET_FLOAT_WORD(hx,x); ix = hx&0x7fffffff; if(ix>=0x7f800000) return one/(x*x); x = fabsf(x); if(ix >= 0x40000000) { /* |x| >= 2.0 */ __sincosf (x, &s, &c); ss = s-c; cc = s+c; if(ix<0x7f000000) { /* make sure x+x not overflow */ z = -__cosf(x+x); if ((s*c)<zero) cc = z/ss; else ss = z/cc; } /* * j0(x) = 1/sqrt(pi) * (P(0,x)*cc - Q(0,x)*ss) / sqrt(x) * y0(x) = 1/sqrt(pi) * (P(0,x)*ss + Q(0,x)*cc) / sqrt(x) */ if(ix>0x48000000) z = (invsqrtpi*cc)/__ieee754_sqrtf(x); else { u = pzerof(x); v = qzerof(x); z = invsqrtpi*(u*cc-v*ss)/__ieee754_sqrtf(x); } return z; } if(ix<0x39000000) { /* |x| < 2**-13 */ math_force_eval(huge+x); /* raise inexact if x != 0 */ if(ix<0x32000000) return one; /* |x|<2**-27 */ else return one - (float)0.25*x*x; } z = x*x; r = z*(R02+z*(R03+z*(R04+z*R05))); s = one+z*(S01+z*(S02+z*(S03+z*S04))); if(ix < 0x3F800000) { /* |x| < 1.00 */ return one + z*((float)-0.25+(r/s)); } else { u = (float)0.5*x; return((one+u)*(one-u)+z*(r/s)); } }
float __ieee754_j1f(float x) { float z, s,c,ss,cc,r,u,v,y; int32_t hx,ix; GET_FLOAT_WORD(hx,x); ix = hx&0x7fffffff; if(__builtin_expect(ix>=0x7f800000, 0)) return one/x; y = fabsf(x); if(ix >= 0x40000000) { /* |x| >= 2.0 */ __sincosf (y, &s, &c); ss = -s-c; cc = s-c; if(ix<0x7f000000) { /* make sure y+y not overflow */ z = __cosf(y+y); if ((s*c)>zero) cc = z/ss; else ss = z/cc; } /* * j1(x) = 1/sqrt(pi) * (P(1,x)*cc - Q(1,x)*ss) / sqrt(x) * y1(x) = 1/sqrt(pi) * (P(1,x)*ss + Q(1,x)*cc) / sqrt(x) */ if(ix>0x48000000) z = (invsqrtpi*cc)/__ieee754_sqrtf(y); else { u = ponef(y); v = qonef(y); z = invsqrtpi*(u*cc-v*ss)/__ieee754_sqrtf(y); } if(hx<0) return -z; else return z; } if(__builtin_expect(ix<0x32000000, 0)) { /* |x|<2**-27 */ if(huge+x>one) return (float)0.5*x;/* inexact if x!=0 necessary */ } z = x*x; r = z*(r00+z*(r01+z*(r02+z*r03))); s = one+z*(s01+z*(s02+z*(s03+z*(s04+z*s05)))); r *= x; return(x*(float)0.5+r/s); }
/** * * @param elong * @param phase * @param posX * @param posZ * @return */ HDINLINE float3_X laserTransversal( float3_X elong, float_X phase, const float_X posX, const float_X posZ ) { //const float_X modMue = float_X(PI) * float_X(SPEED_OF_LIGHT / WAVE_LENGTH) * INIT_TIME; const float_X f = SPEED_OF_LIGHT / WAVE_LENGTH; const float_X timeShift = phase / (2.0f * float_X(PI) * float_X(f)) + FOCUS_POS / SPEED_OF_LIGHT; const float_X spaceShift = SPEED_OF_LIGHT * tanf(TILT_X) * timeShift / CELL_HEIGHT; const float_X r2 = (posX + spaceShift) * (posX + spaceShift) + posZ * posZ; // pure gaussian //const float_X r2 = posX * posX + posZ * posZ; //rayleigh length (in y-direction) const float_X y_R = float_X( PI ) * W0 * W0 / WAVE_LENGTH; // the radius of curvature of the beam's wavefronts const float_X R_y = -FOCUS_POS * ( float_X(1.0) + ( y_R / FOCUS_POS )*( y_R / FOCUS_POS ) ); #if !defined(__CUDA_ARCH__) // Host code path //beam waist in the near field: w_y(y=0) == W0 const float_X w_y = W0 * sqrt( float_X(1.0) + ( FOCUS_POS / y_R )*( FOCUS_POS / y_R ) ); //! the Gouy phase shift const float_X xi_y = atan( -FOCUS_POS / y_R ); if( Polarisation == LINEAR_X || Polarisation == LINEAR_Z ) { elong *= math::exp( -r2 / w_y / w_y ) * cos( 2.0f * float_X( PI ) / WAVE_LENGTH * FOCUS_POS - 2.0f * float_X( PI ) / WAVE_LENGTH * r2 / 2.0f / R_y + xi_y + phase ) * math::exp( -( r2 / 2.0f / R_y - FOCUS_POS - phase / 2.0f / float_X( PI ) * WAVE_LENGTH ) *( r2 / 2.0f / R_y - FOCUS_POS - phase / 2.0f / float_X( PI ) * WAVE_LENGTH ) / SPEED_OF_LIGHT / SPEED_OF_LIGHT / ( 2.0f * PULSE_LENGTH ) / ( 2.0f * PULSE_LENGTH ) ); } else if( Polarisation == CIRCULAR ) { elong.x() *= math::exp( -r2 / w_y / w_y ) * cos( 2.0f * float_X( PI ) / WAVE_LENGTH * FOCUS_POS - 2.0f * float_X( PI ) / WAVE_LENGTH * r2 / 2.0f / R_y + xi_y + phase ) * math::exp( -( r2 / 2.0f / R_y - FOCUS_POS - phase / 2.0f / float_X( PI ) * WAVE_LENGTH ) *( r2 / 2.0f / R_y - FOCUS_POS - phase / 2.0f / float_X( PI ) * WAVE_LENGTH ) / SPEED_OF_LIGHT / SPEED_OF_LIGHT / ( 2.0f * PULSE_LENGTH ) / ( 2.0f * PULSE_LENGTH ) ); phase += float_X( PI / 2.0 ); elong.z() *= math::exp( -r2 / w_y / w_y ) * cos( 2.0f * float_X( PI ) / WAVE_LENGTH * FOCUS_POS - 2.0f * float_X( PI ) / WAVE_LENGTH * r2 / 2.0f / R_y + xi_y + phase ) * math::exp( -( r2 / 2.0f / R_y - FOCUS_POS - phase / 2.0f / float_X( PI ) * WAVE_LENGTH ) *( r2 / 2.0f / R_y - FOCUS_POS - phase / 2.0f / float_X( PI ) * WAVE_LENGTH ) / SPEED_OF_LIGHT / SPEED_OF_LIGHT / ( 2.0f * PULSE_LENGTH ) / ( 2.0f * PULSE_LENGTH ) ); phase -= float_X( PI / 2.0 ); } return elong; #else //beam waist in the near field: w_y(y=0) == W0 const float_X w_y = W0 * sqrtf( float_X(1.0) + ( FOCUS_POS / y_R )*( FOCUS_POS / y_R ) ); //! the Gouy phase shift const float_X xi_y = atanf( -FOCUS_POS / y_R ); if( Polarisation == LINEAR_X || Polarisation == LINEAR_Z ) { elong *= math::exp( -r2 / w_y / w_y ) * __cosf( 2.0f * float_X( PI ) / WAVE_LENGTH * FOCUS_POS - 2.0f * float_X( PI ) / WAVE_LENGTH * r2 / 2.0f / R_y + xi_y + phase ) * math::exp( -( r2 / 2.0f / R_y - FOCUS_POS - phase / 2.0f / float_X( PI ) * WAVE_LENGTH ) *( r2 / 2.0f / R_y - FOCUS_POS - phase / 2.0f / float_X( PI ) * WAVE_LENGTH ) / SPEED_OF_LIGHT / SPEED_OF_LIGHT / ( 2.0f * PULSE_LENGTH ) / ( 2.0f * PULSE_LENGTH ) ); } else if( Polarisation == CIRCULAR ) { elong.x() *= math::exp( -r2 / w_y / w_y ) * __cosf( 2.0f * float_X( PI ) / WAVE_LENGTH * FOCUS_POS - 2.0f * float_X( PI ) / WAVE_LENGTH * r2 / 2.0f / R_y + xi_y + phase ) * math::exp( -( r2 / 2.0f / R_y - FOCUS_POS - phase / 2.0f / float_X( PI ) * WAVE_LENGTH ) *( r2 / 2.0f / R_y - FOCUS_POS - phase / 2.0f / float_X( PI ) * WAVE_LENGTH ) / SPEED_OF_LIGHT / SPEED_OF_LIGHT / ( 2.0f * PULSE_LENGTH ) / ( 2.0f * PULSE_LENGTH ) ); phase += float_X( PI / 2.0 ); elong.z() *= math::exp( -r2 / w_y / w_y ) * __cosf( 2.0f * float_X( PI ) / WAVE_LENGTH * FOCUS_POS - 2.0f * float_X( PI ) / WAVE_LENGTH * r2 / 2.0f / R_y + xi_y + phase ) * math::exp( -( r2 / 2.0f / R_y - FOCUS_POS - phase / 2.0f / float_X( PI ) * WAVE_LENGTH ) *( r2 / 2.0f / R_y - FOCUS_POS - phase / 2.0f / float_X( PI ) * WAVE_LENGTH ) / SPEED_OF_LIGHT / SPEED_OF_LIGHT / ( 2.0f * PULSE_LENGTH ) / ( 2.0f * PULSE_LENGTH ) ); phase -= float_X( PI / 2.0 ); } return elong; #endif }
__device__ inline float occaCuda_fastCos(const float x){ return __cosf(x); }
DINLINE float operator()(const float& value) const { return __cosf(value); }
float __ieee754_gammaf_r (float x, int *signgamp) { int32_t hx; float ret; GET_FLOAT_WORD (hx, x); if (__glibc_unlikely ((hx & 0x7fffffff) == 0)) { /* Return value for x == 0 is Inf with divide by zero exception. */ *signgamp = 0; return 1.0 / x; } if (__builtin_expect (hx < 0, 0) && (u_int32_t) hx < 0xff800000 && __rintf (x) == x) { /* Return value for integer x < 0 is NaN with invalid exception. */ *signgamp = 0; return (x - x) / (x - x); } if (__glibc_unlikely (hx == 0xff800000)) { /* x == -Inf. According to ISO this is NaN. */ *signgamp = 0; return x - x; } if (__glibc_unlikely ((hx & 0x7f800000) == 0x7f800000)) { /* Positive infinity (return positive infinity) or NaN (return NaN). */ *signgamp = 0; return x + x; } if (x >= 36.0f) { /* Overflow. */ *signgamp = 0; ret = math_narrow_eval (FLT_MAX * FLT_MAX); return ret; } else { SET_RESTORE_ROUNDF (FE_TONEAREST); if (x > 0.0f) { *signgamp = 0; int exp2_adj; float tret = gammaf_positive (x, &exp2_adj); ret = __scalbnf (tret, exp2_adj); } else if (x >= -FLT_EPSILON / 4.0f) { *signgamp = 0; ret = 1.0f / x; } else { float tx = __truncf (x); *signgamp = (tx == 2.0f * __truncf (tx / 2.0f)) ? -1 : 1; if (x <= -42.0f) /* Underflow. */ ret = FLT_MIN * FLT_MIN; else { float frac = tx - x; if (frac > 0.5f) frac = 1.0f - frac; float sinpix = (frac <= 0.25f ? __sinf ((float) M_PI * frac) : __cosf ((float) M_PI * (0.5f - frac))); int exp2_adj; float tret = (float) M_PI / (-x * sinpix * gammaf_positive (-x, &exp2_adj)); ret = __scalbnf (tret, -exp2_adj); math_check_force_underflow_nonneg (ret); } } ret = math_narrow_eval (ret); } if (isinf (ret) && x != 0) { if (*signgamp < 0) { ret = math_narrow_eval (-__copysignf (FLT_MAX, ret) * FLT_MAX); ret = -ret; } else ret = math_narrow_eval (__copysignf (FLT_MAX, ret) * FLT_MAX); return ret; } else if (ret == 0) { if (*signgamp < 0) { ret = math_narrow_eval (-__copysignf (FLT_MIN, ret) * FLT_MIN); ret = -ret; } else ret = math_narrow_eval (__copysignf (FLT_MIN, ret) * FLT_MIN); return ret; } else return ret; }