Ejemplo n.º 1
0
// Loads and returns the value stored at `ptr`.
// When __CUDA_ARCH__ is defined and is at least 350 the load is issued through
// the __ldg() intrinsic; in every other compilation pass it is a plain
// dereference.
__device__ __forceinline__ T ldg(const T* ptr) {
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 350)
    // No __ldg() path selected for this pass: ordinary load.
    return *ptr;
#else
    return __ldg(ptr);
#endif
}
Ejemplo n.º 2
0
 // Returns the element at index i.
 // In a CUDA device compilation pass (__CUDA_ARCH__ defined) the element is
 // fetched as an AliasType through __ldg() and then reinterpreted as a
 // ValueType; in any other pass m_ptr is indexed directly.
 KOKKOS_INLINE_FUNCTION
 ValueType operator[]( const iType & i ) const
   {
     #if defined( __CUDA_ARCH__ )
     // View the element's address as a read-only AliasType for __ldg().
     const AliasType * const aliased = reinterpret_cast<const AliasType*>( &m_ptr[i] );
     AliasType fetched = __ldg( aliased );
     // Reinterpret the fetched bits back as the element type.
     return *( reinterpret_cast<ValueType*>( &fetched ) );
     #else
     return m_ptr[i];
     #endif
   }
Ejemplo n.º 3
0
 // Returns element (i + m_offset) of the allocation, using one of three paths
 // chosen at compile time:
 //   1. KOKKOS_USE_LDG_INTRINSIC defined and __CUDA_ARCH__ >= 300:
 //      fetch through __ldg() as an AliasType, then reinterpret as ValueType.
 //   2. __CUDA_ARCH__ >= 300 without KOKKOS_USE_LDG_INTRINSIC:
 //      fetch through the texture object m_obj with tex1Dfetch.
 //   3. Otherwise: index m_alloc_ptr directly.
 // All three paths address the same element, i + m_offset.
 KOKKOS_INLINE_FUNCTION
 ValueType operator[]( const iType & i ) const
   {
     #if defined( KOKKOS_USE_LDG_INTRINSIC ) && defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
       // m_offset must be applied here as well: the other two branches read
       // element i + m_offset, and omitting it made this branch return a
       // different element whenever m_offset != 0.
       // The cast targets const AliasType* because __ldg() takes a pointer to
       // const and reinterpret_cast may not drop constness.
       // NOTE(review): __ldg() availability at __CUDA_ARCH__ 300 should be
       // confirmed against the CUDA toolkit in use; the guard is left as-is.
       AliasType v = __ldg(reinterpret_cast<const AliasType*>(&m_alloc_ptr[ i + m_offset ]));
       return  *(reinterpret_cast<ValueType*> (&v));
     #elif defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
       AliasType v = tex1Dfetch<AliasType>( m_obj , i + m_offset );
       return  *(reinterpret_cast<ValueType*> (&v));
     #else
       return m_alloc_ptr[ i + m_offset ];
     #endif
 }
Ejemplo n.º 4
0
      // Computes the determinant of a 3x3 matrix J and stores it in detJ.
      //
      // J is accumulated over the nodes of one element as
      //   J[r][c] = sum over nodes n of grad[n][r] * coord[n][c]
      // where
      //   grad[n][r]  = gradients[elethidx*spatialDim*numNodesPerElem + n*spatialDim + r]
      //                 (read through __ldg())
      //   coord[n][c] = elemNodeCoords[Hex8::spatialDim*n + c]
      //
      // elethidx       : index used to locate this element's block inside gradients.
      // elemNodeCoords : node coordinates; indices up to
      //                  Hex8::spatialDim*(Hex8::numNodesPerElem-1)+2 are read, so the
      //                  bound written in the parameter declaration is not the real
      //                  extent (as a function parameter it is a plain pointer).
      // detJ           : output, written once at the end.
      // gradients      : read-only gradient values.
      __device__ __inline__ void compute_detJ(int elethidx, const Scalar elemNodeCoords[Hex8::spatialDim],
          Scalar& detJ, const Scalar *gradients)
      {
        // Row-major accumulator for J: jRC holds J[R][C].
        Scalar j00=0, j01=0, j02=0;
        Scalar j10=0, j11=0, j12=0;
        Scalar j20=0, j21=0, j22=0;

#pragma unroll
        for(size_t node=0; node<Hex8::numNodesPerElem; ++node) {
          // Three coordinate components of this node.
          const Scalar c0=elemNodeCoords[Hex8::spatialDim*node];
          const Scalar c1=elemNodeCoords[Hex8::spatialDim*node+1];
          const Scalar c2=elemNodeCoords[Hex8::spatialDim*node+2];

          // Offset of this node's gradient triple inside gradients.
          int gradBase = elethidx*spatialDim*numNodesPerElem+node*spatialDim;
          const Scalar g0=__ldg(gradients+gradBase+0);
          const Scalar g1=__ldg(gradients+gradBase+1);
          const Scalar g2=__ldg(gradients+gradBase+2);

          // Rank-one update J += g * c^T.
          j00 += g0*c0;
          j01 += g0*c1;
          j02 += g0*c2;

          j10 += g1*c0;
          j11 += g1*c1;
          j12 += g1*c2;

          j20 += g2*c0;
          j21 += g2*c1;
          j22 += g2*c2;
        }

        // 2x2 minors for the cofactor expansion along the first column.
        const Scalar minor0 = j22*j11 - j21*j12;
        const Scalar minor1 = j22*j01 - j21*j02;
        const Scalar minor2 = j12*j01 - j11*j02;

        detJ = j00*minor0 - j10*minor1 + j20*minor2;
      }
Ejemplo n.º 5
0
 // Returns the element at index i.
 // In a CUDA device compilation pass (__CUDA_ARCH__ defined) the element is
 // fetched as an AliasType through __ldg() and reinterpreted as a ValueType.
 // In any other pass m_ptr is indexed directly, since __ldg() is a device
 // intrinsic and must not be reached from a non-device compilation of this
 // function.
 KOKKOS_INLINE_FUNCTION
 ValueType operator[]( const iType & i ) const
   {
     #ifdef __CUDA_ARCH__
     // Cast to const AliasType*: __ldg() takes a pointer to const, and
     // reinterpret_cast cannot drop constness if m_ptr points to const data.
     AliasType v = __ldg(reinterpret_cast<const AliasType*>(&m_ptr[i]));
     return  *(reinterpret_cast<ValueType*> (&v));
     #else
     return m_ptr[i];
     #endif
   }
Ejemplo n.º 6
0
      // Computes, for one element, the determinant of a 3x3 matrix J and the
      // product of J's inverse with each node's gradient triple.
      //
      // J is accumulated over the nodes as
      //   J[r][c] = sum over nodes n of grad[n][r] * coord[n][c]
      // with
      //   grad[n][r]  = gradients[elethidx*spatialDim*numNodesPerElem + n*spatialDim + r]
      //                 (read through __ldg())
      //   coord[n][c] = elemNodeCoords[n*Hex8::spatialDim + c]
      //
      // elethidx       : index used to locate this element's block inside gradients.
      // elemNodeCoords : node coordinates; indices up to
      //                  (numNodesPerElem-1)*Hex8::spatialDim+2 are read, so the bound
      //                  written in the parameter declaration is not the real extent.
      // invJ_grad_vals : output; entries j*spatialDim+0..2 are written for every
      //                  node j, holding inverse(J) applied to that node's gradient.
      // detJ           : output; determinant of J.
      // gradients      : read-only gradient values.
      //
      // No check is made for detJ == 0; the reciprocal is taken unconditionally.
      __device__ void compute_detJ_invJ_grad_vals(int elethidx, const Scalar elemNodeCoords[Hex8::spatialDim],
          Scalar invJ_grad_vals[numNodesPerElem*spatialDim],
          Scalar& detJ, const Scalar *gradients)
      {
        // J0..J8 hold J in row-major order: [J0 J1 J2; J3 J4 J5; J6 J7 J8].
        Scalar J0=0,J1=0,J2=0,J3=0,J4=0,J5=0,J6=0,J7=0,J8=0;
        Scalar grad_vals0, grad_vals1, grad_vals2;
#pragma unroll
        for(size_t i=0; i<numNodesPerElem; ++i) {

          // Offset of node i's gradient triple inside gradients.
          int texidx = elethidx*spatialDim*numNodesPerElem+i*spatialDim;
          grad_vals0=__ldg(gradients + texidx+0);
          grad_vals1=__ldg(gradients + texidx+1);
          grad_vals2=__ldg(gradients + texidx+2);

          Scalar elemNodeCoords0=elemNodeCoords[i*Hex8::spatialDim];
          Scalar elemNodeCoords1=elemNodeCoords[i*Hex8::spatialDim+1];
          Scalar elemNodeCoords2=elemNodeCoords[i*Hex8::spatialDim+2];

          // Rank-one update J += grad * coord^T.
          J0 += grad_vals0*elemNodeCoords0;
          J1 += grad_vals0*elemNodeCoords1;
          J2 += grad_vals0*elemNodeCoords2;

          J3 += grad_vals1*elemNodeCoords0;
          J4 += grad_vals1*elemNodeCoords1;
          J5 += grad_vals1*elemNodeCoords2;

          J6 += grad_vals2*elemNodeCoords0;
          J7 += grad_vals2*elemNodeCoords1;
          J8 += grad_vals2*elemNodeCoords2;
        }

        // The nine 2x2 minors of J; term(3*c + r) is the minor obtained by
        // deleting row r and column c.
        Scalar term0 = J8*J4 - J7*J5;
        Scalar term1 = J8*J1 - J7*J2;
        Scalar term2 = J5*J1 - J4*J2;

        Scalar term3 = J8*J3 - J6*J5;
        Scalar term4 = J8*J0 - J6*J2;
        Scalar term5 = J5*J0 - J3*J2;

        Scalar term6 = J7*J3 - J6*J4;
        Scalar term7 = J7*J0 - J6*J1;
        Scalar term8 = J4*J0 - J3*J1;

        // Cofactor expansion along the first column.
        detJ = J0*term0 - J3*term1 + J6*term2;
        // Form the numerator in Scalar so that a float Scalar is not promoted
        // to a double-precision division by a bare 1.0 literal.
        Scalar inv_detJ = static_cast<Scalar>(1.0)/detJ;

        // Overwrite J0..J8 with inverse(J): signed minors (adjugate) scaled by
        // 1/detJ, still in row-major order.
        J0 =  term0*inv_detJ;
        J1 = -term1*inv_detJ;
        J2 =  term2*inv_detJ;

        J3 = -term3*inv_detJ;
        J4 =  term4*inv_detJ;
        J5 = -term5*inv_detJ;

        J6 =  term6*inv_detJ;
        J7 = -term7*inv_detJ;
        J8 =  term8*inv_detJ;

#pragma unroll
        for(int j=0; j<numNodesPerElem; ++j) {
          // The gradient triple is re-read here rather than kept from the
          // first loop.
          int texidx = elethidx*spatialDim*numNodesPerElem+j*spatialDim;
          Scalar gv0=__ldg(gradients+texidx+0);
          Scalar gv1=__ldg(gradients+texidx+1);
          Scalar gv2=__ldg(gradients+texidx+2);

          // invJ_grad_vals[j] = inverse(J) * grad[j].
          invJ_grad_vals[j*spatialDim+0] = J0 * gv0 + J1 * gv1 + J2 * gv2;
          invJ_grad_vals[j*spatialDim+1] = J3 * gv0 + J4 * gv1 + J5 * gv2;
          invJ_grad_vals[j*spatialDim+2] = J6 * gv0 + J7 * gv1 + J8 * gv2;
        }
      }