// Loads *ptr for device code.
//
// When compiling for compute capability 3.5 or newer the load goes through
// the __ldg intrinsic (read-only data cache load); for every other
// compilation pass it is an ordinary dereference.  The value returned is the
// same in both cases.
//
// ptr: address of the element to read; it is dereferenced unconditionally.
__device__ __forceinline__ T ldg(const T* ptr)
{
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 350)
  // No __ldg intrinsic on this target: plain load.
  return *ptr;
#else
  return __ldg(ptr);
#endif
}
// Returns element i of m_ptr by value.
//
// Device code for compute capability 3.5+ fetches the element with the
// __ldg intrinsic (read-only data cache load).  __ldg does not exist on older
// architectures, so those targets -- and the host pass -- use a plain indexed
// load instead of failing to compile.
//
// The element's storage is loaded as an AliasType and then reinterpreted as a
// ValueType.
// NOTE(review): this presumes sizeof(AliasType) == sizeof(ValueType) and that
// &m_ptr[i] is suitably aligned for AliasType -- confirm where AliasType is
// chosen.
KOKKOS_INLINE_FUNCTION
ValueType operator[]( const iType & i ) const
{
#if defined( __CUDA_ARCH__ ) && ( 350 <= __CUDA_ARCH__ )
  AliasType v = __ldg(reinterpret_cast<const AliasType*>(&m_ptr[i]));
  // Hand the loaded bits back as the user-visible value type.
  return *(reinterpret_cast<ValueType*> (&v));
#else
  return m_ptr[i];
#endif
}
// Returns the element at index (i + m_offset) relative to m_alloc_ptr.
//
// Three code paths, all of which read the same element:
//   1. KOKKOS_USE_LDG_INTRINSIC defined and compute capability >= 3.5:
//      __ldg (read-only data cache load) on the address.  __ldg is not
//      available below 3.5, so the guard is 350 rather than 300.
//   2. Any other device target with compute capability >= 3.0:
//      tex1Dfetch through the texture object m_obj.
//   3. Everything else (host pass, older devices): plain indexed load.
//
// The __ldg path applies m_offset just like the other two paths; without it
// this path would read a different element whenever m_offset is non-zero.
// NOTE(review): this relies on m_obj being bound at m_alloc_ptr, as the
// fallback path implies -- confirm where m_obj is created.
//
// Paths 1 and 2 load the storage as AliasType and reinterpret it as
// ValueType.
// NOTE(review): presumes sizeof(AliasType) == sizeof(ValueType) -- confirm.
KOKKOS_INLINE_FUNCTION
ValueType operator[]( const iType & i ) const
{
#if defined( KOKKOS_USE_LDG_INTRINSIC ) && defined( __CUDA_ARCH__ ) && ( 350 <= __CUDA_ARCH__ )
  // const-qualified cast: __ldg takes a pointer-to-const, and this stays
  // well-formed whether or not m_alloc_ptr itself points to const.
  AliasType v = __ldg(reinterpret_cast<const AliasType*>(&m_alloc_ptr[ i + m_offset ]));
  return *(reinterpret_cast<ValueType*> (&v));
#elif defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
  AliasType v = tex1Dfetch<AliasType>( m_obj , i + m_offset );
  return *(reinterpret_cast<ValueType*> (&v));
#else
  return m_alloc_ptr[ i + m_offset ];
#endif
}
// Computes the determinant of the 3x3 matrix
//
//     J[a][b] = sum over nodes n of  grad[n][a] * coord[n][b]
//
// elethidx       : selects the slab of `gradients` to read; the slab starts
//                  at elethidx * spatialDim * numNodesPerElem.
// elemNodeCoords : node coordinates, three consecutive values per node.  The
//                  function reads Hex8::spatialDim * Hex8::numNodesPerElem
//                  entries; the bound written in the parameter declaration
//                  does not limit that.
// detJ           : output, receives the determinant.
// gradients      : gradient values, three consecutive values per node within
//                  the slab; read with __ldg (read-only data cache load).
//
// NOTE(review): offsets +0/+1/+2 are hard-coded, so this presumes
// spatialDim == 3 -- confirm.
__device__ __inline__ void compute_detJ(int elethidx,
                                        const Scalar elemNodeCoords[Hex8::spatialDim],
                                        Scalar& detJ,
                                        const Scalar *gradients)
{
  // Row-major entries of J: jacRC holds J[R][C].
  Scalar jac00 = 0, jac01 = 0, jac02 = 0;
  Scalar jac10 = 0, jac11 = 0, jac12 = 0;
  Scalar jac20 = 0, jac21 = 0, jac22 = 0;

  #pragma unroll
  for (size_t node = 0; node < Hex8::numNodesPerElem; ++node) {
    const int gradOffset = elethidx*spatialDim*numNodesPerElem+node*spatialDim;
    const Scalar* nodeGrad = gradients + gradOffset;
    const Scalar* nodeXyz  = elemNodeCoords + Hex8::spatialDim*node;

    const Scalar g0 = __ldg(nodeGrad + 0);
    const Scalar g1 = __ldg(nodeGrad + 1);
    const Scalar g2 = __ldg(nodeGrad + 2);

    const Scalar x = nodeXyz[0];
    const Scalar y = nodeXyz[1];
    const Scalar z = nodeXyz[2];

    // Outer product of this node's gradient with its coordinates.
    jac00 += g0*x;
    jac01 += g0*y;
    jac02 += g0*z;
    jac10 += g1*x;
    jac11 += g1*y;
    jac12 += g1*z;
    jac20 += g2*x;
    jac21 += g2*y;
    jac22 += g2*z;
  }

  // Cofactor expansion along the first column of J.
  const Scalar minor0 = jac22*jac11 - jac21*jac12;
  const Scalar minor1 = jac22*jac01 - jac21*jac02;
  const Scalar minor2 = jac12*jac01 - jac11*jac02;

  detJ = jac00*minor0 - jac10*minor1 + jac20*minor2;
}
// Returns element i of m_ptr by value.
//
// Device code for compute capability 3.5+ fetches the element with the
// __ldg intrinsic (read-only data cache load): the storage is loaded as an
// AliasType and then reinterpreted as a ValueType.  __ldg is a device-only
// intrinsic that requires compute capability 3.5, so the host pass and older
// device targets use a plain indexed load, which yields the same value.
//
// NOTE(review): the aliasing load presumes
// sizeof(AliasType) == sizeof(ValueType) and suitable alignment of
// &m_ptr[i] -- confirm where AliasType is chosen.
KOKKOS_INLINE_FUNCTION
ValueType operator[]( const iType & i ) const
{
#if defined( __CUDA_ARCH__ ) && ( 350 <= __CUDA_ARCH__ )
  // const-qualified cast: __ldg takes a pointer-to-const, and this stays
  // well-formed whether or not m_ptr itself points to const.
  AliasType v = __ldg(reinterpret_cast<const AliasType*>(&m_ptr[i]));
  return *(reinterpret_cast<ValueType*> (&v));
#else
  return m_ptr[i];
#endif
}
// Builds the 3x3 matrix
//
//     J[a][b] = sum over nodes n of  grad[n][a] * coord[n][b]
//
// then produces its determinant and, for every node, the product of the
// inverse of J with that node's gradient vector.
//
// elethidx       : selects the slab of `gradients` to read; the slab starts
//                  at elethidx * spatialDim * numNodesPerElem.
// elemNodeCoords : node coordinates, three consecutive values per node.  The
//                  function reads Hex8::spatialDim * numNodesPerElem entries;
//                  the bound written in the parameter declaration does not
//                  limit that.
// invJ_grad_vals : output, numNodesPerElem * spatialDim entries;
//                  entry [n*spatialDim + r] = sum_c inv(J)[r][c] * grad[n][c].
// detJ           : output, receives det(J).
// gradients      : gradient values, three consecutive values per node within
//                  the slab; read with __ldg (read-only data cache load).
//
// detJ is not tested against zero before it is inverted, so a singular J
// propagates inf/NaN into invJ_grad_vals.
//
// NOTE(review): offsets +0/+1/+2 are hard-coded, so this presumes
// spatialDim == 3 -- confirm.
__device__ void compute_detJ_invJ_grad_vals(int elethidx,
                                            const Scalar elemNodeCoords[Hex8::spatialDim],
                                            Scalar invJ_grad_vals[numNodesPerElem*spatialDim],
                                            Scalar& detJ,
                                            const Scalar *gradients)
{
  // Row-major entries of J: J(3*a + b) holds J[a][b].
  Scalar J0=0,J1=0,J2=0,J3=0,J4=0,J5=0,J6=0,J7=0,J8=0;

  #pragma unroll
  for(size_t i=0; i<numNodesPerElem; ++i) {
    int texidx = elethidx*spatialDim*numNodesPerElem+i*spatialDim;

    const Scalar grad_vals0=__ldg(gradients + texidx+0);
    const Scalar grad_vals1=__ldg(gradients + texidx+1);
    const Scalar grad_vals2=__ldg(gradients + texidx+2);

    const Scalar elemNodeCoords0=elemNodeCoords[i*Hex8::spatialDim];
    const Scalar elemNodeCoords1=elemNodeCoords[i*Hex8::spatialDim+1];
    const Scalar elemNodeCoords2=elemNodeCoords[i*Hex8::spatialDim+2];

    // Outer product of this node's gradient with its coordinates.
    J0 += grad_vals0*elemNodeCoords0;
    J1 += grad_vals0*elemNodeCoords1;
    J2 += grad_vals0*elemNodeCoords2;

    J3 += grad_vals1*elemNodeCoords0;
    J4 += grad_vals1*elemNodeCoords1;
    J5 += grad_vals1*elemNodeCoords2;

    J6 += grad_vals2*elemNodeCoords0;
    J7 += grad_vals2*elemNodeCoords1;
    J8 += grad_vals2*elemNodeCoords2;
  }

  // 2x2 minors of J; with alternating signs they form the adjugate.
  const Scalar term0 = J8*J4 - J7*J5;
  const Scalar term1 = J8*J1 - J7*J2;
  const Scalar term2 = J5*J1 - J4*J2;

  const Scalar term3 = J8*J3 - J6*J5;
  const Scalar term4 = J8*J0 - J6*J2;
  const Scalar term5 = J5*J0 - J3*J2;

  const Scalar term6 = J7*J3 - J6*J4;
  const Scalar term7 = J7*J0 - J6*J1;
  const Scalar term8 = J4*J0 - J3*J1;

  // Cofactor expansion along the first column of J.
  detJ = J0*term0 - J3*term1 + J6*term2;

  // Keep the reciprocal in Scalar precision: a bare 1.0 literal would promote
  // the division to double when Scalar is float.
  const Scalar inv_detJ = static_cast<Scalar>(1) / detJ;

  // inv(J) = adj(J) / det(J), row-major.
  const Scalar invJ0 =  term0*inv_detJ;
  const Scalar invJ1 = -term1*inv_detJ;
  const Scalar invJ2 =  term2*inv_detJ;

  const Scalar invJ3 = -term3*inv_detJ;
  const Scalar invJ4 =  term4*inv_detJ;
  const Scalar invJ5 = -term5*inv_detJ;

  const Scalar invJ6 =  term6*inv_detJ;
  const Scalar invJ7 = -term7*inv_detJ;
  const Scalar invJ8 =  term8*inv_detJ;

  // Second pass over the nodes: the gradients are read again rather than
  // kept in locals from the first loop.
  #pragma unroll
  for(int j=0; j<numNodesPerElem; ++j) {
    int texidx = elethidx*spatialDim*numNodesPerElem+j*spatialDim;

    const Scalar gv0=__ldg(gradients+texidx+0);
    const Scalar gv1=__ldg(gradients+texidx+1);
    const Scalar gv2=__ldg(gradients+texidx+2);

    invJ_grad_vals[j*spatialDim+0] = invJ0 * gv0 + invJ1 * gv1 + invJ2 * gv2;
    invJ_grad_vals[j*spatialDim+1] = invJ3 * gv0 + invJ4 * gv1 + invJ5 * gv2;
    invJ_grad_vals[j*spatialDim+2] = invJ6 * gv0 + invJ7 * gv1 + invJ8 * gv2;
  }
}