KOKKOS_INLINE_FUNCTION
    void OrthPolynomial<1>::generate( /**/  outputViewType output,
                                      const inputViewType input,
                                      const ordinal_type p ) {
      // First-derivative variant of the generator.
      //
      // The two coordinates of every input point are copied into Sacado SFad
      // numbers with two derivative slots, coordinate j being seeded as
      // independent variable j.  OrthPolynomial<0>::generate is then evaluated
      // on those numbers and the derivative components are copied out:
      //   output(i,j,k) = d/dx_k of function i at point j,  k = 0,1
      //
      // output - written for i < output.dimension(0), j < input.dimension(0), k < 2
      // input  - points; columns 0 and 1 are read
      // p      - forwarded unchanged to OrthPolynomial<0>::generate
      //
      // maxOrder and maxNumPts size the stack buffers below; they are not
      // declared in this function and are expected to come from the enclosing
      // declaration.  NOTE(review): nothing here checks npts <= maxNumPts or
      // card <= maxCard before the buffers are wrapped.
      typedef typename outputViewType::value_type value_type;
      typedef Sacado::Fad::SFad<value_type,2> fad_type;

      constexpr ordinal_type maxCard = (maxOrder+1)*(maxOrder+2)/2;

      const ordinal_type
        npts = input.dimension(0),
        card = output.dimension(0);

      // use stack buffer
      fad_type inBuf[maxNumPts][2], outBuf[maxCard][maxNumPts];

      // Both work views are rank 2: `in` is (point, coordinate) and `out` is
      // (function, point).  The views wrap the first element of the
      // corresponding two-dimensional stack arrays.
      Kokkos::View<fad_type**,Kokkos::Impl::ActiveExecutionMemorySpace> in (&inBuf[0][0],  npts, 2);
      Kokkos::View<fad_type**,Kokkos::Impl::ActiveExecutionMemorySpace> out(&outBuf[0][0], card, npts);

      // seed coordinate j of every point as independent variable j (of 2)
      for (ordinal_type i=0;i<npts;++i)
        for (ordinal_type j=0;j<2;++j) {
          in(i,j) = fad_type( input(i,j) );
          in(i,j).diff(j,2);
        }

      OrthPolynomial<0>::generate<maxOrder,maxNumPts>(out, in, p);

      // extract the two derivative components of every function value
      for (ordinal_type i=0;i<card;++i)
        for (ordinal_type j=0;j<npts;++j)
          for (ordinal_type k=0;k<2;++k)
            output(i,j,k) = out(i,j).dx(k);
    }
    // Evaluates the basis, or one of its derivatives, at the given points.
    //
    // Impl::Basis_HGRAD_LINE_Cn_FEM_JACOBI::Serial<opType>::getValues is called
    // with alpha = beta = 0 and order = card-1, writing into a view that wraps
    // the caller-provided work storage.  The result is then combined with vinv:
    //   output(i,j[,k]) = sum_l vinv(l,i) * phis(l,j[,k])
    //
    // output     - result; indexed (i,j) for OPERATOR_VALUE, (i,j,k) otherwise
    // input      - evaluation points; dimension(0) is the number of points
    // work       - scratch storage, wrapped (not allocated) as the view phis
    // vinv       - coefficient matrix; dimension(0) is the basis cardinality
    // operatorDn - derivative order; used as given only for OPERATOR_Dn, the
    //              fixed-order operators derive it from opType instead
    //
    // Any other operator type triggers INTREPID2_TEST_FOR_ABORT.
    KOKKOS_INLINE_FUNCTION
    void
    Basis_HGRAD_LINE_Cn_FEM::Serial<opType>::
    getValues(       outputViewType output,
               const inputViewType  input,
                     workViewType   work,
               const vinvViewType   vinv,
               const ordinal_type   operatorDn ) {
      typedef typename Kokkos::DynRankView<typename workViewType::value_type, typename workViewType::memory_space> viewType;

      // may be overridden below for the fixed-order derivative operators
      ordinal_type derivOrder = operatorDn;

      const ordinal_type numBasis = vinv.dimension(0);
      const ordinal_type numPts   = input.dimension(0);
      const ordinal_type degree   = numBasis - 1;

      const double jacobiAlpha = 0.0;
      const double jacobiBeta  = 0.0;

      auto wrapProp = Kokkos::common_view_alloc_prop(work);

      switch (opType) {
      case OPERATOR_VALUE: {
        viewType phis(Kokkos::view_wrap(work.data(), wrapProp), numBasis, numPts);

        Impl::Basis_HGRAD_LINE_Cn_FEM_JACOBI::
          Serial<opType>::getValues(phis, input, degree, jacobiAlpha, jacobiBeta);

        for (ordinal_type bf=0;bf<numBasis;++bf) {
          for (ordinal_type pt=0;pt<numPts;++pt) {
            output(bf,pt) = 0.0;
            for (ordinal_type m=0;m<numBasis;++m) {
              output(bf,pt) += vinv(m,bf)*phis(m,pt);
            }
          }
        }
        break;
      }
      case OPERATOR_GRAD:
      case OPERATOR_D1:
      case OPERATOR_D2:
      case OPERATOR_D3:
      case OPERATOR_D4:
      case OPERATOR_D5:
      case OPERATOR_D6:
      case OPERATOR_D7:
      case OPERATOR_D8:
      case OPERATOR_D9:
      case OPERATOR_D10:
        derivOrder = getOperatorOrder(opType);
        // intentional fall-through: shares the evaluation code of OPERATOR_Dn
      case OPERATOR_Dn: {
        // a 1D element has exactly one derivative component
        const ordinal_type numDeriv = 1;
        viewType phis(Kokkos::view_wrap(work.data(), wrapProp), numBasis, numPts, numDeriv);

        Impl::Basis_HGRAD_LINE_Cn_FEM_JACOBI::
          Serial<opType>::getValues(phis, input, degree, jacobiAlpha, jacobiBeta, derivOrder);

        for (ordinal_type bf=0;bf<numBasis;++bf) {
          for (ordinal_type pt=0;pt<numPts;++pt) {
            for (ordinal_type d=0;d<numDeriv;++d) {
              output(bf,pt,d) = 0.0;
              for (ordinal_type m=0;m<numBasis;++m) {
                output(bf,pt,d) += vinv(m,bf)*phis(m,pt,d);
              }
            }
          }
        }
        break;
      }
      default: {
        INTREPID2_TEST_FOR_ABORT( true,
                                  ">>> ERROR: (Intrepid2::Basis_HGRAD_LINE_Cn_FEM::Serial::getValues) operator is not supported." );
      }
      }
    }
    // Fills output(idx(p,q), i) with the polynomials D^{p,q}, 0 <= p+q <= order,
    // evaluated at the points (input(i,0), input(i,1)), and scales each row by
    // sqrt((p+0.5)*(p+q+1)).
    //
    // output - written at rows idx(p,q) = (p+q)(p+q+1)/2 + q, columns 0..npts-1
    // input  - points; dimension(0) is the number of points, columns 0,1 are read
    // order  - largest total degree p+q that is generated
    //
    // maxNumPts sizes the stack scratch arrays; it is not declared in this
    // function and is expected to come from the enclosing declaration.
    // NOTE(review): nothing here checks npts <= maxNumPts.
    KOKKOS_INLINE_FUNCTION
    void OrthPolynomial<0>::generate( /**/  outputViewType output,
                                      const inputViewType input,
                                      const ordinal_type order ) {
      typedef typename outputViewType::value_type value_type;

      const ordinal_type
        npts = input.dimension(0);

      const auto z = input;

      // each point needs to be transformed from Pavel's element
      // z(i,0) --> (2.0 * z(i,0) - 1.0)
      // z(i,1) --> (2.0 * z(i,1) - 1.0)

      // row of output that holds D^{p,q}
      auto idx = [](const ordinal_type p,
                    const ordinal_type q) {
        return (p+q)*(p+q+1)/2+q;
      };

      // coefficients (an, bn, cn) of the three-term recurrence used for the
      // q-direction below:  D_{n+1} = (an*x + bn) * D_n - cn * D_{n-1}
      auto jrc = [](const value_type alpha,
                    const value_type beta ,
                    const ordinal_type n ,
                    /**/  value_type  &an,
                    /**/  value_type  &bn,
                    /**/  value_type  &cn) {
        an = ( (2.0 * n + 1.0 + alpha + beta) * ( 2.0 * n + 2.0 + alpha + beta ) /
               (2.0 * ( n + 1 ) * ( n + 1 + alpha + beta ) ) );
        bn = ( (alpha*alpha-beta*beta)*(2.0*n+1.0+alpha+beta) /
               (2.0*(n+1.0)*(2.0*n+alpha+beta)*(n+1.0+alpha+beta) ) );
        cn = ( (n+alpha)*(n+beta)*(2.0*n+2.0+alpha+beta) /
               ( (n+1.0)*(n+1.0+alpha+beta)*(2.0*n+alpha+beta) ) );
      };

      // set D^{0,0} = 1.0
      // The constant is written as an expression in z; presumably this keeps
      // the result consistent with derivative-carrying scalar types - TODO confirm.
      {
        const ordinal_type loc = idx(0,0);
        for (ordinal_type i=0;i<npts;++i)
          output(loc, i) = 1.0 + z(i,0) - z(i,0) + z(i,1) - z(i,1);
      }

      if (order > 0) {
        value_type f1[maxNumPts],f2[maxNumPts],f3[maxNumPts];

        // point-wise factors of the p-recurrence, in transformed coordinates
        for (ordinal_type i=0;i<npts;++i) {
          f1[i] = 0.5 * (1.0+2.0*(2.0*z(i,0)-1.0)+(2.0*z(i,1)-1.0));
          f2[i] = 0.5 * (1.0-(2.0*z(i,1)-1.0));
          f3[i] = f2[i] * f2[i];
        }

        // set D^{1,0} = f1
        {
          const ordinal_type loc = idx(1,0);
          for (ordinal_type i=0;i<npts;++i)
            output(loc, i) = f1[i];
        }

        // recurrence in p
        // D^{p+1,0} = (2p+1)/(p+1) f1 D^{p,0} - p/(p+1) f3 D^{p-1,0}
        for (ordinal_type p=1;p<order;p++) {
          const ordinal_type
            loc = idx(p,0),
            loc_p1 = idx(p+1,0),
            loc_m1 = idx(p-1,0);

          const value_type
            a = (2.0*p+1.0)/(1.0+p),
            b = p / (p+1.0);

          for (ordinal_type i=0;i<npts;++i)
            output(loc_p1,i) = ( a * f1[i] * output(loc,i) -
                                 b * f3[i] * output(loc_m1,i) );
        }

        // D^{p,1}
        for (ordinal_type p=0;p<order;++p) {
          const ordinal_type
            loc_p_0 = idx(p,0),
            loc_p_1 = idx(p,1);

          for (ordinal_type i=0;i<npts;++i)
            output(loc_p_1,i) = output(loc_p_0,i)*0.5*(1.0+2.0*p+(3.0+2.0*p)*(2.0*z(i,1)-1.0));
        }


        // recurrence in q
        for (ordinal_type p=0;p<order-1;++p)
          for (ordinal_type q=1;q<order-p;++q) {
            const ordinal_type
              loc_p_qp1 = idx(p,q+1),
              loc_p_q = idx(p,q),
              loc_p_qm1 = idx(p,q-1);

            value_type a,b,c;
            jrc((value_type)(2*p+1),(value_type)0,q,a,b,c);
            for (ordinal_type i=0;i<npts;++i)
              output(loc_p_qp1,i) = ( (a*(2.0*z(i,1)-1.0)+b)*output(loc_p_q,i)
                                      - c*output(loc_p_qm1,i) );
          }
      }

      // orthogonalize
      for (ordinal_type p=0;p<=order;++p)
        for (ordinal_type q=0;q<=order-p;++q)
          for (ordinal_type i=0;i<npts;++i)
            output(idx(p,q),i) *= sqrt( (p+0.5)*(p+q+1.0));
    }