Example #1
0
Real
SolidWallBC::computeQpOffDiagJacobian(unsigned jvar)
{
  RealVectorValue hU(0., 0., 0.);  
  if (jvar == _hu_var)
  {
    switch (_equ_type)
    {
      case y_mom:
        return _eos.dp_dhu(_h[_qp], hU)*_normals[_qp](1)*_test[_i][_qp];
        break;
      default:
        return 0.;
        break;
    }
  }
  else if (jvar == _hv_var)
  {
    switch (_equ_type)
    {
      case x_mom:
        return _eos.dp_dhv(_h[_qp], hU)*_normals[_qp](0)*_test[_i][_qp];
        break;
      default:
        return 0.;
        break;
    }
  }
  else
    return 0.;
}
Real
SaintVenantSetWaterVelocity::computeQpJacobian()
{
    RealVectorValue hU(_h[_qp]*_u_bc, 0., 0.);
    Real dpdhu = _eos.dp_dhu(_h[_qp], hU);
    switch (_equ_type)
    {
    case continuity:
        return _phi[_j][_qp]*_u_bc*_normals[_qp](0)*_test[_i][_qp];
        break;
    case x_mom:
        return _phi[_j][_qp]*dpdhu*_normals[_qp](0)*_test[_i][_qp];
        break;
    default:
        mooseError("'" << this->name() << "' Invalid equation name.");
    }
}
Real
SaintVenantSetWaterVelocity::computeQpResidual()
{
    RealVectorValue hU(_h[_qp]*_u_bc, 0., 0.);
    Real p = _eos.pressure(_h[_qp], hU);
    switch (_equ_type)
    {
    case continuity:
        return _h[_qp]*_u_bc*_normals[_qp](0)*_test[_i][_qp];
        break;
    case x_mom:
        return (_u_bc*_u_bc*_h[_qp]+p)*_normals[_qp](0)*_test[_i][_qp];
        break;
    default:
        mooseError("'" << this->name() << "' Invalid equation name.");
    }
}
Example #4
0
Real
SolidWallBC::computeQpJacobian()
{
  RealVectorValue hU(0., 0., 0.);
  switch (_equ_type)
  {
    case continuity:
      return 0.;
      break;
    case x_mom:
      return _eos.dp_dhu(_h[_qp], hU)*_normals[_qp](0)*_test[_i][_qp];
      break;
    case y_mom:
      return _eos.dp_dhv(_h[_qp], hU)*_normals[_qp](1)*_test[_i][_qp];
      break;
    default:
      mooseError("'" << this->name() << "' Invalid equation name.");
  }
}
Example #5
0
Real
SolidWallBC::computeQpResidual()
{
  RealVectorValue hU(0., 0., 0.);
  Real p = _eos.pressure(_h[_qp], hU);
  switch (_equ_type)
  {
    case continuity:
      return 0.;
      break;
    case x_mom:
      return p*_normals[_qp](0)*_test[_i][_qp];
    case y_mom:
      return p*_normals[_qp](1)*_test[_i][_qp];
      break;
    default:
      mooseError("'" << this->name() << "' Invalid equation name.");
  }
}
Real
SaintVenantSetWaterVelocity::computeQpOffDiagJacobian(unsigned jvar)
{
    RealVectorValue hU(_h[_qp]*_u_bc, 0., 0.);
    Real dpdu(0.0);
    if (jvar == _h_var)
    {
        switch (_equ_type)
        {
        case x_mom:
            dpdu = _eos.dp_dh(_h[_qp], hU);
            return _phi[_j][_qp]*dpdu*_normals[_qp](0)*_test[_i][_qp];
            break;
        default:
            mooseError("'" << this->name() << "' Invalid equation name.");
        }
    }
    else
        return 0.;
}
Example #7
0
// static
void GSparseMatrix::test()
{
	// Make the data
	GSparseMatrix sm(4, 4);
	sm.set(0, 0, 2.0); sm.set(0, 2, 3.0);
	sm.set(1, 0, 1.0); sm.set(2, 3, -2.0);
	sm.set(2, 2, 5.0);
	sm.set(3, 1, -3.0); sm.set(3, 3, -1.0);
	GData* fm = sm.toFullMatrix();
	Holder<GData> hFM(fm);

	// Do it with the full matrix
	GData* pU;
	double* pDiag;
	GData* pV;
	fm->singularValueDecomposition(&pU, &pDiag, &pV);
	Holder<GData> hU(pU);
	ArrayHolder<double> hDiag(pDiag);
	Holder<GData> hV(pV);

	// Do it with the sparse matrix
	GSparseMatrix* pSU;
	double* pSDiag;
	GSparseMatrix* pSV;
	sm.singularValueDecomposition(&pSU, &pSDiag, &pSV);
	Holder<GSparseMatrix> hSU(pSU);
	ArrayHolder<double> hSDiag(pSDiag);
	Holder<GSparseMatrix> hSV(pSV);

	// Check the results
	GData* pV2 = pSV->toFullMatrix();
	Holder<GData> hV2(pV2);
	double err = pV2->sumSquaredDifference(*pV, false);
	if(err > 1e-6)
		ThrowError("Failed");
}
void
PressureBasedViscosityCoefficient::computeQpProperties()
{
  // Cell size
  Real h_cell = _current_elem->hmin();

  // Speed of sound
  RealVectorValue hU(_hu[_qp], 0., 0.);
  Real c2=_eos.c2(_h[_qp], hU);

  // Compute normalization parameter
  Real eps = std::sqrt(std::numeric_limits<Real>::min());
  Real norm = 0.;
  switch (_pbv_type)
  {
    case JST:
      norm = std::fabs(_press[_qp]);
      break;
    case HMP:
      norm = h_cell*_press_grad[_qp].size()+eps;
      break;
    case ST:
      norm = 0.5*(h_cell*_press_grad[_qp].size()+std::fabs(_press[_qp]));
      break;      
    default:
      mooseError("'" << this->name() << "': invalid viscosity type. The options are: JST, HMP and ST");
      break;
  }

  // Pre-compute the viscosity coefficient
  Real kappa = (std::fabs(_norm_vel[_qp])+std::sqrt(c2))*std::fabs(_press_laplace[_qp])/norm;

  // Set material viscosity coefficient
  _kappa_max[_qp] = 0.5*h_cell*(hU.size()/_h[_qp]+std::sqrt(c2));
  _kappa[_qp] = std::min(_Ce*h_cell*h_cell*h_cell*kappa, _kappa_max[_qp]);
}
Example #9
0
/**
    Purpose
    -------
    ZSSSSM applies the LU factorization update from a complex
    matrix formed by a lower triangular IB-by-K tile L1 on top of a
    M2-by-K tile L2 to a second complex matrix formed by a M1-by-N1
    tile A1 on top of a M2-by-N2 tile A2 (N1 == N2).

    This is the right-looking Level 2.5 BLAS version of the algorithm.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in]
    ib      INTEGER
            The inner-blocking size.  IB >= 0.

    @param[in]
    NB      INTEGER
            The blocking size.  NB >= 0.

    @param[in,out]
    hU      COMPLEX_16 array, dimension(LDHU, N), on cpu.
            On entry, the NB-by-N upper triangular tile hU.
            On exit, the content is incomplete. Shouldn't be used.

    @param[in]
    ldhu    INTEGER
            The leading dimension of the array hU.  LDHU >= max(1,NB).

    @param[in,out]
    dU      COMPLEX_16 array, dimension(LDDU, N), on gpu.
            On entry, the NB-by-N upper triangular tile dU identical to hU.
            On exit, the new factor U from the factorization.

    @param[in]
    lddu    INTEGER
            The leading dimension of the array dU.  LDDU >= max(1,NB).

    @param[in,out]
    hA      COMPLEX_16 array, dimension(LDHA, N), on cpu.
            On entry, only the M-by-IB first panel needs to be identical to dA(1..M, 1..IB).
            On exit, the content is incomplete. Shouldn't be used.

    @param[in]
    ldha    INTEGER
            The leading dimension of the array hA.  LDHA >= max(1,M).

    @param[in,out]
    dA      COMPLEX_16 array, dimension(LDDA, N), on gpu.
            On entry, the M-by-N tile to be factored.
            On exit, the factor L from the factorization

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).

    @param[out]
    hL      COMPLEX_16 array, dimension(LDHL, K), on vpu.
            On exit, contains in the upper part the IB-by-K lower triangular tile,
            and in the lower part IB-by-K the inverse of the top part.

    @param[in]
    ldhl    INTEGER
            The leading dimension of the array hL.  LDHL >= max(1,2*IB).

    @param[out]
    dL      COMPLEX_16 array, dimension(LDDL, K), on gpu.
            On exit, contains in the upper part the IB-by-K lower triangular tile,
            and in the lower part IB-by-K the inverse of the top part.

    @param[in]
    lddl    INTEGER
            The leading dimension of the array dL.  LDDL >= max(1,2*IB).

    @param[out]
    hWORK   COMPLEX_16 array, dimension(LDHWORK, 2*IB), on cpu.
            Workspace.

    @param[in]
    ldhwork INTEGER
            The leading dimension of the array hWORK.  LDHWORK >= max(NB, 1).

    @param[out]
    dWORK   COMPLEX_16 array, dimension(LDDWORK, 2*IB), on gpu.
            Workspace.

    @param[in]
    lddwork INTEGER
            The leading dimension of the array dWORK.  LDDWORK >= max(NB, 1).

    @param[out]
    ipiv    INTEGER array on the cpu.
            The pivot indices array of size K as returned by ZTSTRF

    @param[out]
    info    INTEGER
            - PLASMA_SUCCESS successful exit
            - < 0 if INFO = -k, the k-th argument had an illegal value
            - > 0 if INFO = k, U(k,k) is exactly zero. The factorization
                has been completed, but the factor U is exactly
                singular, and division by zero will occur if it is used
                to solve a system of equations.

    @ingroup magma_zgesv_tile
    ********************************************************************/
extern "C" magma_int_t
magma_ztstrf_gpu(
    magma_order_t order, magma_int_t m, magma_int_t n,
    magma_int_t ib, magma_int_t nb,
    magmaDoubleComplex    *hU, magma_int_t ldhu,
    magmaDoubleComplex_ptr dU, magma_int_t lddu,
    magmaDoubleComplex    *hA, magma_int_t ldha,
    magmaDoubleComplex_ptr dA, magma_int_t ldda,
    magmaDoubleComplex    *hL, magma_int_t ldhl,
    magmaDoubleComplex_ptr dL, magma_int_t lddl,
    magma_int_t *ipiv,
    magmaDoubleComplex    *hwork, magma_int_t ldhwork,
    magmaDoubleComplex_ptr dwork, magma_int_t lddwork,
    magma_int_t *info)
{
#define UT(i,j) (dUT + (i)*ib*lddu + (j)*ib )
#define AT(i,j) (dAT + (i)*ib*ldda + (j)*ib )
#define L(i)    (dL  + (i)*ib*lddl          )
#define L2(i)   (dL2 + (i)*ib*lddl          )
#define hU(i,j) (hU  + (j)*ib*ldhu + (i)*ib )
#define hA(i,j) (hA  + (j)*ib*ldha + (i)*ib )
#define hL(i)   (hL  + (i)*ib*ldhl          )
#define hL2(i)  (hL2 + (i)*ib*ldhl          )

    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;

    int iinfo = 0;
    int maxm, mindim;
    int i, j, im, s, ip, ii, sb, p = 1;
    magmaDoubleComplex_ptr dAT, dUT;
    magmaDoubleComplex_ptr dAp, dUp;
#ifndef WITHOUTTRTRI
    magmaDoubleComplex_ptr dL2 = dL + ib;
    magmaDoubleComplex *hL2 = hL + ib;
    p = 2;
#endif

    /* Check input arguments */
    *info = 0;
    if (m < 0) {
        *info = -1;
    }
    else if (n < 0) {
        *info = -2;
    }
    else if (ib < 0) {
        *info = -3;
    }
    else if ((lddu < max(1,m)) && (m > 0)) {
        *info = -6;
    }
    else if ((ldda < max(1,m)) && (m > 0)) {
        *info = -8;
    }
    else if ((lddl < max(1,ib)) && (ib > 0)) {
        *info = -10;
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* quick return */
    if ((m == 0) || (n == 0) || (ib == 0))
        return *info;

    ip = 0;

    /* Function Body */
    mindim = min(m, n);
    s      = mindim / ib;

    if ( ib >= mindim ) {
        /* Use CPU code. */
        CORE_ztstrf(m, n, ib, nb,
                    (PLASMA_Complex64_t*)hU, ldhu,
                    (PLASMA_Complex64_t*)hA, ldha,
                    (PLASMA_Complex64_t*)hL, ldhl,
                    ipiv,
                    (PLASMA_Complex64_t*)hwork, ldhwork,
                    info);

#ifndef WITHOUTTRTRI
        CORE_zlacpy( PlasmaUpperLower, mindim, mindim,
                     (PLASMA_Complex64_t*)hL, ldhl,
                     (PLASMA_Complex64_t*)hL2, ldhl );
        CORE_ztrtri( PlasmaLower, PlasmaUnit, mindim,
                     (PLASMA_Complex64_t*)hL2, ldhl, info );
        if (*info != 0 ) {
            fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info);
        }
#endif

        if ( order == MagmaRowMajor ) {
            magma_zsetmatrix( m, n, hU, ldhu, dwork, lddwork );
            magmablas_ztranspose( m, n, dwork, lddwork, dU, lddu );

            magma_zsetmatrix( m, n, hA, ldha, dwork, lddwork );
            magmablas_ztranspose( m, n, dwork, lddwork, dA, ldda );
        } else {
            magma_zsetmatrix( m, n, hU, ldhu, dU, lddu );
            magma_zsetmatrix( m, n, hA, ldha, dA, ldda );
        }
        magma_zsetmatrix( p*ib, n, hL, ldhl, dL, lddl );
    }
    else {
        /* Use hybrid blocked code. */
        maxm = magma_roundup( m, 32 );

        if ( order == MagmaColMajor ) {
            magmablas_zgetmo_in( dU, dUT, lddu, m,  n );
            magmablas_zgetmo_in( dA, dAT, ldda, m,  n );
        } else {
            dUT = dU; dAT = dA;
        }
        dAp = dwork;
        dUp = dAp + ib*lddwork;

        ip = 0;
        for( i=0; i < s; i++ ) {
            ii = i * ib;
            sb = min(mindim-ii, ib);
            
            if ( i > 0 ) {
                // download i-th panel
                magmablas_ztranspose( sb, ii, UT(0,i), lddu, dUp, lddu );
                magmablas_ztranspose( sb, m,  AT(0,i), ldda, dAp, ldda );
                
                magma_zgetmatrix( ii, sb, dUp, lddu, hU(0, i), ldhu );
                magma_zgetmatrix( m,  sb, dAp, ldda, hA(0, i), ldha );
                
                // make sure that gpu queue is empty
                //magma_device_sync();
                
#ifndef WITHOUTTRTRI
                magma_ztrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             n-(ii+sb), ib,
                             c_one, L2(i-1),      lddl,
                                    UT(i-1, i+1), lddu);
#else
                magma_ztrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             n-(ii+sb), ib,
                             c_one, L(i-1),       lddl,
                                    UT(i-1, i+1), lddu);
#endif
                magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(ii+sb), m, ib,
                             c_neg_one, UT(i-1, i+1), lddu,
                                        AT(0,   i-1), ldda,
                             c_one,     AT(0,   i+1), ldda );
            }

            // do the cpu part
            CORE_ztstrf(m, sb, ib, nb,
                        (PLASMA_Complex64_t*)hU(i, i), ldhu,
                        (PLASMA_Complex64_t*)hA(0, i), ldha,
                        (PLASMA_Complex64_t*)hL(i),    ldhl,
                        ipiv+ii,
                        (PLASMA_Complex64_t*)hwork, ldhwork,
                        info);

            if ( (*info == 0) && (iinfo > 0) )
                *info = iinfo + ii;
            
            // Need to swap betw U and A
#ifndef NOSWAPBLK
            magmablas_zswapblk( MagmaRowMajor, n-(ii+sb),
                                UT(i, i+1), lddu,
                                AT(0, i+1), ldda,
                                1, sb, ipiv+ii, 1, nb );

            for (j=0; j < ib; j++) {
                im = ipiv[ip]-1;
                if ( im == j ) {
                    ipiv[ip] += ii;
                }
                ip++;
            }
#else
            for (j=0; j < ib; j++) {
                im = ipiv[ip]-1;
                if ( im != (j) ) {
                    im = im - nb;
                    assert( (im >= 0) && (im < m) );
                    magmablas_zswap( n-(ii+sb), UT(i, i+1)+j*lddu, 1, AT(0, i+1)+im*ldda, 1 );
                } else {
                    ipiv[ip] += ii;
                }
                ip++;
            }
#endif

#ifndef WITHOUTTRTRI
            CORE_zlacpy( PlasmaUpperLower, sb, sb,
                         (PLASMA_Complex64_t*)hL(i), ldhl,
                         (PLASMA_Complex64_t*)hL2(i), ldhl );
            CORE_ztrtri( PlasmaLower, PlasmaUnit, sb,
                         (PLASMA_Complex64_t*)hL2(i), ldhl, info );
            if (*info != 0 ) {
                fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info);
            }
#endif
            // upload i-th panel
            magma_zsetmatrix( sb, sb, hU(i, i), ldhu, dUp, lddu );
            magma_zsetmatrix( m, sb, hA(0, i), ldha, dAp, ldda );
            magma_zsetmatrix( p*ib, sb, hL(i), ldhl, L(i), lddl );
            magmablas_ztranspose( sb, sb, dUp, lddu, UT(i,i), lddu );
            magmablas_ztranspose( m,  sb, dAp, ldda, AT(0,i), ldda );
            
            // make sure that gpu queue is empty
            //magma_device_sync();
            
            // do the small non-parallel computations
            if ( s > (i+1) ) {
#ifndef WITHOUTTRTRI
                magma_ztrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             sb, sb,
                             c_one, L2(i),      lddl,
                                    UT(i, i+1), lddu);
#else
                magma_ztrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             sb, sb,
                             c_one, L(i),      lddl,
                                    UT(i, i+1), lddu);
#endif
                magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                             sb, m, sb,
                             c_neg_one, UT(i, i+1), lddu,
                                        AT(0, i  ), ldda,
                             c_one,     AT(0, i+1), ldda );
            }
            else {
#ifndef WITHOUTTRTRI
                magma_ztrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             n-mindim, sb,
                             c_one, L2(i),      lddl,
                                    UT(i, i+1), lddu);
#else
                magma_ztrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             n-mindim, sb,
                             c_one, L(i),      lddl,
                                    UT(i, i+1), lddu);
#endif
                magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                             n-mindim, m, sb,
                             c_neg_one, UT(i, i+1), lddu,
                                        AT(0, i  ), ldda,
                             c_one,     AT(0, i+1), ldda );
            }
        }

        if ( order == MagmaColMajor ) {
            magmablas_zgetmo_out( dU, dUT, lddu, m,  n );
            magmablas_zgetmo_out( dA, dAT, ldda, m,  n );
        }
    }
    return *info;
}
Example #10
0
extern "C" magma_int_t
magma_ctstrf_gpu( char storev, magma_int_t m, magma_int_t n, magma_int_t ib, magma_int_t nb,
                  magmaFloatComplex *hU, magma_int_t ldhu, magmaFloatComplex *dU, magma_int_t lddu,
                  magmaFloatComplex *hA, magma_int_t ldha, magmaFloatComplex *dA, magma_int_t ldda,
                  magmaFloatComplex *hL, magma_int_t ldhl, magmaFloatComplex *dL, magma_int_t lddl,
                  magma_int_t *ipiv,
                  magmaFloatComplex *hwork, magma_int_t ldhwork, magmaFloatComplex *dwork, magma_int_t lddwork,
                  magma_int_t *info)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    CSSSSM applies the LU factorization update from a complex
    matrix formed by a lower triangular IB-by-K tile L1 on top of a
    M2-by-K tile L2 to a second complex matrix formed by a M1-by-N1
    tile A1 on top of a M2-by-N2 tile A2 (N1 == N2).

    This is the right-looking Level 2.5 BLAS version of the algorithm.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    IB      (input) INTEGER
            The inner-blocking size.  IB >= 0.

    NB      (input) INTEGER
            The blocking size.  NB >= 0.

    hU      (input,output) COMPLEX array, dimension(LDHU, N), on cpu.
            On entry, the NB-by-N upper triangular tile hU.
            On exit, the content is incomplete. Shouldn't be used.

    LDHU    (input) INTEGER
            The leading dimension of the array hU.  LDHU >= max(1,NB).

    dU      (input,output) COMPLEX array, dimension(LDDU, N), on gpu.
            On entry, the NB-by-N upper triangular tile dU identical to hU.
            On exit, the new factor U from the factorization.

    LDDU    (input) INTEGER
            The leading dimension of the array dU.  LDDU >= max(1,NB).

    hA      (input,output) COMPLEX array, dimension(LDHA, N), on cpu.
            On entry, only the M-by-IB first panel needs to be identical to dA(1..M, 1..IB).
            On exit, the content is incomplete. Shouldn't be used.

    LDHA    (input) INTEGER
            The leading dimension of the array hA.  LDHA >= max(1,M).

    dA      (input,output) COMPLEX array, dimension(LDDA, N) , on gpu.
            On entry, the M-by-N tile to be factored.
            On exit, the factor L from the factorization

    LDDA    (input) INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).

    hL      (output) COMPLEX array, dimension(LDHL, K), on vpu.
            On exit, contains in the upper part the IB-by-K lower triangular tile,
            and in the lower part IB-by-K the inverse of the top part.

    LDHL    (input) INTEGER
            The leading dimension of the array hL.  LDHL >= max(1,2*IB).

    dL      (output) COMPLEX array, dimension(LDDL, K), on gpu.
            On exit, contains in the upper part the IB-by-K lower triangular tile,
            and in the lower part IB-by-K the inverse of the top part.

    LDDL    (input) INTEGER
            The leading dimension of the array dL.  LDDL >= max(1,2*IB).

    hWORK   (output) COMPLEX array, dimension(LDHWORK, 2*IB), on cpu.
            Workspace.

    LDHWORK (input) INTEGER
            The leading dimension of the array hWORK.  LDHWORK >= max(NB, 1).

    dWORK   (output) COMPLEX array, dimension(LDDWORK, 2*IB), on gpu.
            Workspace.

    LDDWORK (input) INTEGER
            The leading dimension of the array dWORK.  LDDWORK >= max(NB, 1).

    IPIV    (output) INTEGER array on the cpu.
            The pivot indices array of size K as returned by CTSTRF

    INFO    (output) INTEGER
            - PLASMA_SUCCESS successful exit
            - < 0 if INFO = -k, the k-th argument had an illegal value
            - > 0 if INFO = k, U(k,k) is exactly zero. The factorization
                has been completed, but the factor U is exactly
                singular, and division by zero will occur if it is used
                to solve a system of equations.

    =====================================================================    */

#define UT(i,j) (dUT + (i)*ib*lddu + (j)*ib )
#define AT(i,j) (dAT + (i)*ib*ldda + (j)*ib )
#define L(i)    (dL  + (i)*ib*lddl          )
#define L2(i)   (dL2 + (i)*ib*lddl          )
#define hU(i,j) (hU  + (j)*ib*ldhu + (i)*ib )
#define hA(i,j) (hA  + (j)*ib*ldha + (i)*ib )
#define hL(i)   (hL  + (i)*ib*ldhl          )
#define hL2(i)  (hL2 + (i)*ib*ldhl          )

    magmaFloatComplex c_one     = MAGMA_C_ONE;
    magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;

    int iinfo = 0;
    int maxm, mindim;
    int i, j, im, s, ip, ii, sb, p = 1;
    magmaFloatComplex *dAT, *dUT;
    magmaFloatComplex *dAp, *dUp;
#ifndef WITHOUTTRTRI
    magmaFloatComplex *dL2 = dL + ib;
    magmaFloatComplex *hL2 = hL + ib;
    p = 2;
#endif

    /* Check input arguments */
    *info = 0;
    if (m < 0) {
        *info = -1;
    }
    else if (n < 0) {
        *info = -2;
    }
    else if (ib < 0) {
        *info = -3;
    }
    else if ((lddu < max(1,m)) && (m > 0)) {
        *info = -6;
    }
    else if ((ldda < max(1,m)) && (m > 0)) {
        *info = -8;
    }
    else if ((lddl < max(1,ib)) && (ib > 0)) {
        *info = -10;
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* quick return */
    if ((m == 0) || (n == 0) || (ib == 0))
        return *info;

    ip = 0;

    /* Function Body */
    mindim = min(m, n);
    s      = mindim / ib;

    if ( ib >= mindim ) {
        /* Use CPU code. */
        CORE_ctstrf(m, n, ib, nb,
                    (PLASMA_Complex32_t*)hU, ldhu,
                    (PLASMA_Complex32_t*)hA, ldha,
                    (PLASMA_Complex32_t*)hL, ldhl,
                    ipiv,
                    (PLASMA_Complex32_t*)hwork, ldhwork,
                    info);

#ifndef WITHOUTTRTRI
        CORE_clacpy( PlasmaUpperLower, mindim, mindim,
                     (PLASMA_Complex32_t*)hL, ldhl,
                     (PLASMA_Complex32_t*)hL2, ldhl );
        CORE_ctrtri( PlasmaLower, PlasmaUnit, mindim,
                     (PLASMA_Complex32_t*)hL2, ldhl, info );
        if (*info != 0 ) {
            fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info);
        }
#endif

        if ( (storev == 'R') || (storev == 'r') ) {
            magma_csetmatrix( m, n, hU, ldhu, dwork, lddwork );
            magmablas_ctranspose( dU, lddu, dwork, lddwork, m, n );

            magma_csetmatrix( m, n, hA, ldha, dwork, lddwork );
            magmablas_ctranspose( dA, ldda, dwork, lddwork, m, n );
        } else {
            magma_csetmatrix( m, n, hU, ldhu, dU, lddu );
            magma_csetmatrix( m, n, hA, ldha, dA, ldda );
        }
        magma_csetmatrix( p*ib, n, hL, ldhl, dL, lddl );
            
    }
    else {
        /* Use hybrid blocked code. */
        maxm = ((m + 31)/32)*32;

        if ( (storev == 'C') || (storev == 'c') ) {
            magmablas_cgetmo_in( dU, dUT, lddu, m,  n );
            magmablas_cgetmo_in( dA, dAT, ldda, m,  n );
        } else {
            dUT = dU; dAT = dA;
        }
        dAp = dwork;
        dUp = dAp + ib*lddwork;

        ip = 0;
        for( i=0; i<s; i++ )
        {
            ii = i * ib;
            sb = min(mindim-ii, ib);
            
            if ( i>0 ){
                // download i-th panel
                magmablas_ctranspose( dUp, lddu, UT(0, i), lddu, sb, ii );
                magmablas_ctranspose( dAp, ldda, AT(0, i), ldda, sb, m  );
                
                magma_cgetmatrix( ii, sb, dUp, lddu, hU(0, i), ldhu );
                magma_cgetmatrix( m, sb, dAp, ldda, hA(0, i), ldha );
                
                // make sure that gpu queue is empty
                //magma_device_sync();
                
#ifndef WITHOUTTRTRI
                magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             n-(ii+sb), ib,
                             c_one, L2(i-1),      lddl,
                                    UT(i-1, i+1), lddu);
#else
                magma_ctrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             n-(ii+sb), ib,
                             c_one, L(i-1),       lddl,
                                    UT(i-1, i+1), lddu);
#endif
                magma_cgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(ii+sb), m, ib,
                             c_neg_one, UT(i-1, i+1), lddu,
                                        AT(0,   i-1), ldda,
                             c_one,     AT(0,   i+1), ldda );
            }

            // do the cpu part
            CORE_ctstrf(m, sb, ib, nb,
                        (PLASMA_Complex32_t*)hU(i, i), ldhu,
                        (PLASMA_Complex32_t*)hA(0, i), ldha,
                        (PLASMA_Complex32_t*)hL(i),    ldhl,
                        ipiv+ii,
                        (PLASMA_Complex32_t*)hwork, ldhwork,
                        info);

            if ( (*info == 0) && (iinfo > 0) )
                *info = iinfo + ii;
            
            // Need to swap betw U and A
#ifndef NOSWAPBLK
            magmablas_cswapblk( 'R', n-(ii+sb),
                                UT(i, i+1), lddu,
                                AT(0, i+1), ldda,
                                1, sb, ipiv+ii, 1, nb );

            for(j=0; j<ib; j++) {
                im = ipiv[ip]-1;
                if ( im == j ) {
                    ipiv[ip] += ii;
                }
                ip++;
            }
#else
            for(j=0; j<ib; j++) {
                im = ipiv[ip]-1;
                if ( im != (j) ) {
                    im = im - nb;
                    assert( (im>=0) && (im<m) );
                    magmablas_cswap( n-(ii+sb), UT(i, i+1)+j*lddu, 1, AT(0, i+1)+im*ldda, 1 );
                } else {
                    ipiv[ip] += ii;
                }
                ip++;
            }
#endif

#ifndef WITHOUTTRTRI
            CORE_clacpy( PlasmaUpperLower, sb, sb,
                         (PLASMA_Complex32_t*)hL(i), ldhl,
                         (PLASMA_Complex32_t*)hL2(i), ldhl );
            CORE_ctrtri( PlasmaLower, PlasmaUnit, sb,
                         (PLASMA_Complex32_t*)hL2(i), ldhl, info );
            if (*info != 0 ) {
                fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info);
            }
#endif
            // upload i-th panel
            magma_csetmatrix( sb, sb, hU(i, i), ldhu, dUp, lddu );
            magma_csetmatrix( m, sb, hA(0, i), ldha, dAp, ldda );
            magma_csetmatrix( p*ib, sb, hL(i), ldhl, L(i), lddl );
            magmablas_ctranspose( UT(i, i), lddu, dUp, lddu, sb, sb);
            magmablas_ctranspose( AT(0, i), ldda, dAp, ldda, m,  sb);
            
            // make sure that gpu queue is empty
            //magma_device_sync();
            
            // do the small non-parallel computations
            if ( s > (i+1) ) {
#ifndef WITHOUTTRTRI
                magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             sb, sb,
                             c_one, L2(i),      lddl,
                                    UT(i, i+1), lddu);
#else
                magma_ctrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             sb, sb,
                             c_one, L(i),      lddl,
                                    UT(i, i+1), lddu);
#endif
                magma_cgemm( MagmaNoTrans, MagmaNoTrans,
                             sb, m, sb,
                             c_neg_one, UT(i, i+1), lddu,
                                        AT(0, i  ), ldda,
                             c_one,     AT(0, i+1), ldda );
            }
            else {
#ifndef WITHOUTTRTRI
                magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             n-mindim, sb,
                             c_one, L2(i),      lddl,
                                    UT(i, i+1), lddu);
#else
                magma_ctrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             n-mindim, sb,
                             c_one, L(i),      lddl,
                                    UT(i, i+1), lddu);
#endif
                magma_cgemm( MagmaNoTrans, MagmaNoTrans,
                             n-mindim, m, sb,
                             c_neg_one, UT(i, i+1), lddu,
                                        AT(0, i  ), ldda,
                             c_one,     AT(0, i+1), ldda );
            }
        }

        if ( (storev == 'C') || (storev == 'c') ) {
            magmablas_cgetmo_out( dU, dUT, lddu, m,  n );
            magmablas_cgetmo_out( dA, dAT, ldda, m,  n );
        }
    }
    return *info;
}
Example #11
0
void GSparseMatrix::singularValueDecompositionHelper(GSparseMatrix** ppU, double** ppDiag, GSparseMatrix** ppV, int maxIters)
{
	int m = rows();
	int n = cols();
	if(m < n)
		ThrowError("Expected at least as many rows as columns");
	int i, j, k;
	int l = 0;
	int q, iter;
	double c, f, h, s, x, y, z;
	double norm = 0.0;
	double g = 0.0;
	double scale = 0.0;
	GSparseMatrix* pU = new GSparseMatrix(m, m);
	Holder<GSparseMatrix> hU(pU);
	pU->copyFrom(this);
	double* pSigma = new double[n];
	ArrayHolder<double> hSigma(pSigma);
	GSparseMatrix* pV = new GSparseMatrix(n, n);
	Holder<GSparseMatrix> hV(pV);
	GTEMPBUF(double, temp, n + m);
	double* temp2 = temp + n;

	// Householder reduction to bidiagonal form
	for(int i = 0; i < n; i++)
	{
		// Left-hand reduction
		temp[i] = scale * g;
		l = i + 1;
		g = 0.0;
		s = 0.0;
		scale = 0.0;
		if(i < m)
		{
			Iter kend = pU->colEnd(i);
			for(Iter kk = pU->colBegin(i); kk != kend; kk++)
			{
				if(kk->first >= (unsigned int)i)
					scale += ABS(kk->second);
			}
			if(scale != 0.0)
			{
				for(Iter kk = pU->colBegin(i); kk != kend; kk++)
				{
					if(kk->first >= (unsigned int)i)
					{
						double t = kk->second / scale;
						pU->set(kk->first, i, t);
						s += (t * t);
					}
				}
				f = pU->get(i, i);
				g = -GSparseMatrix_takeSign(sqrt(s), f);
				h = f * g - s;
				pU->set(i, i, f - g);
				if(i != n - 1)
				{
					for(j = l; j < n; j++)
					{
						s = 0.0;
						for(Iter kk = pU->colBegin(i); kk != kend; kk++)
						{
							if(kk->first >= (unsigned int)i)
								s += kk->second * pU->get(kk->first, j);
						}
						f = s / h;
						for(Iter kk = pU->colBegin(i); kk != kend; kk++)
						{
							if(kk->first >= (unsigned int)i)
								pU->set(kk->first, j, pU->get(kk->first, j) + f * kk->second);
						}
					}
				}
				for(Iter kk = pU->colBegin(i); kk != kend; kk++)
				{
					if(kk->first >= (unsigned int)i)
						pU->set(kk->first, i, pU->get(kk->first, i) * scale);
				}
			}
		}
		pSigma[i] = scale * g;

		// Right-hand reduction
		g = 0.0;
		s = 0.0;
		scale = 0.0;
		if(i < m && i != n - 1) 
		{
			Iter kend = pU->rowEnd(i);
			for(Iter kk = pU->rowBegin(i); kk != kend; kk++)
			{
				if(kk->first >= (unsigned int)n)
					break;
				if(kk->first >= (unsigned int)l)
					scale += ABS(kk->second);
			}
			if(scale != 0.0) 
			{
				for(Iter kk = pU->rowBegin(i); kk != kend; kk++)
				{
					if(kk->first >= (unsigned int)n)
						break;
					if(kk->first >= (unsigned int)l)
					{
						double t = kk->second / scale;
						pU->set(i, kk->first, t);
						s += (t * t);
					}
				}
				f = pU->get(i, l);
				g = -GSparseMatrix_takeSign(sqrt(s), f);
				h = f * g - s;
				pU->set(i, l, f - g);
				for(k = l; k < n; k++)
					temp[k] = pU->get(i, k) / h;
				if(i != m - 1) 
				{
					for(j = l; j < m; j++) 
					{
						s = 0.0;
						for(Iter kk = pU->rowBegin(i); kk != kend; kk++)
						{
							if(kk->first >= (unsigned int)n)
								break;
							if(kk->first >= (unsigned int)l)
								s += pU->get(j, kk->first) * kk->second;
						}
						Iter kend2 = pU->rowEnd(j);
						for(Iter kk = pU->rowBegin(j); kk != kend2; kk++)
						{
							if(kk->first >= (unsigned int)n)
								break;
							if(kk->first >= (unsigned int)l)
								pU->set(j, kk->first, pU->get(j, kk->first) + s * temp[kk->first]);
						}
					}
				}
				for(Iter kk = pU->rowBegin(i); kk != kend; kk++)
				{
					if(kk->first >= (unsigned int)n)
						break;
					if(kk->first >= (unsigned int)l)
						pU->set(i, kk->first, kk->second * scale);
				}
			}
		}
		norm = MAX(norm, ABS(pSigma[i]) + ABS(temp[i]));
	}

	// Accumulate right-hand transform
	for(int i = n - 1; i >= 0; i--)
	{
		if(i < n - 1)
		{
			if(g != 0.0)
			{
				Iter jend = pU->rowEnd(i);
				for(Iter jj = pU->rowBegin(i); jj != jend; jj++)
				{
					if(jj->first >= (unsigned int)n)
						break;
					if(jj->first >= (unsigned int)l)
						pV->set(i, jj->first, (jj->second / pU->get(i, l)) / g); // (double-division to avoid underflow)
				}
				for(j = l; j < n; j++)
				{
					s = 0.0;
					Iter kend = pU->rowEnd(i);
					for(Iter kk = pU->rowBegin(i); kk != kend; kk++)
					{
						if(kk->first >= (unsigned int)n)
							break;
						if(kk->first >= (unsigned int)l)
							s += kk->second * pV->get(j, kk->first);
					}
					kend = pV->rowEnd(i);
					for(Iter kk = pV->rowBegin(i); kk != kend; kk++)
					{
						if(kk->first >= (unsigned int)n)
							break;
						if(kk->first >= (unsigned int)l)
							pV->set(j, kk->first, pV->get(j, kk->first) + s * kk->second);
					}
				}
			}
			for(j = l; j < n; j++)
			{
				pV->set(i, j, 0.0);
				pV->set(j, i, 0.0);
			}
		}
		pV->set(i, i, 1.0);
		g = temp[i];
		l = i;
	}

	// Accumulate left-hand transform
	for(i = n - 1; i >= 0; i--)
	{
		l = i + 1;
		g = pSigma[i];
		if(i < n - 1)
		{
			for(j = l; j < n; j++)
				pU->set(i, j, 0.0);
		}
		if(g != 0.0)
		{
			g = 1.0 / g;
			if(i != n - 1)
			{
				for(j = l; j < n; j++)
				{
					s = 0.0;
					Iter kend = pU->colEnd(i);
					for(Iter kk = pU->colBegin(i); kk != kend; kk++)
					{
						if(kk->first >= (unsigned int)l)
							s += kk->second * pU->get(kk->first, j);
					}
					f = (s / pU->get(i, i)) * g;
					if(f != 0.0)
					{
						for(Iter kk = pU->colBegin(i); kk != kend; kk++)
						{
							if(kk->first >= (unsigned int)i)
								pU->set(kk->first, j, pU->get(kk->first, j) + f * kk->second);
						}
					}
				}
			}
			for(j = i; j < m; j++)
				pU->set(j, i, pU->get(j, i) * g);
		} 
		else 
		{
			for(j = i; j < m; j++)
				pU->set(j, i, 0.0);
		}
		pU->set(i, i, pU->get(i, i) + 1.0);
	}

	// Diagonalize the bidiagonal matrix
	for(k = n - 1; k >= 0; k--) // For each singular value
	{
		for(iter = 1; iter <= maxIters; iter++)
		{
			// Test for splitting
			bool flag = true;
			for(l = k; l >= 0; l--)
			{
				q = l - 1;
				if(ABS(temp[l]) + norm == norm)
				{
					flag = false;
					break;
				}
				if(ABS(pSigma[q]) + norm == norm)
					break;
			}

			if(flag)
			{
				c = 0.0;
				s = 1.0;
				for(i = l; i <= k; i++)
				{
					f = s * temp[i];
					temp[i] *= c;
					if(ABS(f) + norm == norm)
						break;
					g = pSigma[i];
					h = GSparseMatrix_pythag(f, g);
					pSigma[i] = h;
					h = 1.0 / h;
					c = g * h;
					s = -f * h;
					Iter jendi = pU->colEnd(i);
					Iter jendq = pU->colEnd(q);
					Iter jji = pU->colBegin(i);
					Iter jjq = pU->colBegin(q);
					int tpos;
					for(tpos = 0; jji != jendi || jjq != jendq; tpos++)
					{
						if(jjq == jendq || (jji != jendi && jji->first < jjq->first))
						{
							temp2[tpos] = jji->first;
							jji++;
						}
						else
						{
							temp2[tpos] = jjq->first;
							if(jji != jendi && jjq->first == jji->first)
								jji++;
							jjq++;
						}
					}
					for(int tpos2 = 0; tpos2 < tpos; tpos2++)
					{
						y = pU->get((unsigned int)temp2[tpos2], q);
						z = pU->get((unsigned int)temp2[tpos2], i);
						pU->set((unsigned int)temp2[tpos2], q, y * c + z * s);
						pU->set((unsigned int)temp2[tpos2], i, z * c - y * s);
					}
				}
			}

			z = pSigma[k];
			if(l == k)
			{
				// Detect convergence
				if(z < 0.0)
				{
					// Singular value should be positive
					pSigma[k] = -z;
					for(j = 0; j < n; j++)
						pV->set(k, j, pV->get(k, j) * -1.0);
				}
				break;
			}
			if(iter >= maxIters)
				ThrowError("failed to converge");

			// Shift from bottom 2x2 minor
			x = pSigma[l];
			q = k - 1;
			y = pSigma[q];
			g = temp[q];
			h = temp[k];
			f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
			g = GSparseMatrix_pythag(f, 1.0);
			f = ((x - z) * (x + z) + h * ((y / (f + GSparseMatrix_takeSign(g, f))) - h)) / x;

			// QR transform
			c = 1.0;
			s = 1.0;
			for(j = l; j <= q; j++)
			{
				i = j + 1;
				g = temp[i];
				y = pSigma[i];
				h = s * g;
				g = c * g;
				z = GSparseMatrix_pythag(f, h);
				temp[j] = z;
				c = f / z;
				s = h / z;
				f = x * c + g * s;
				g = g * c - x * s;
				h = y * s;
				y = y * c;
				Iter pendi = pV->rowEnd(i);
				Iter pendj = pV->rowEnd(j);
				Iter ppi = pV->rowBegin(i);
				Iter ppj = pV->rowBegin(j);
				int tpos;
				for(tpos = 0; ppi != pendi || ppj != pendj; tpos++)
				{
					if(ppj == pendj || (ppi != pendi && ppi->first < ppj->first))
					{
						temp2[tpos] = ppi->first;
						ppi++;
					}
					else
					{
						temp2[tpos] = ppj->first;
						if(ppi != pendi && ppj->first == ppi->first)
							ppi++;
						ppj++;
					}
				}
				for(int tpos2 = 0; tpos2 < tpos; tpos2++)
				{
					x = pV->get(j, (unsigned int)temp2[tpos2]);
					z = pV->get(i, (unsigned int)temp2[tpos2]);
					pV->set(j, (unsigned int)temp2[tpos2], x * c + z * s);
					pV->set(i, (unsigned int)temp2[tpos2], z * c - x * s);
				}
				z = GSparseMatrix_pythag(f, h);
				pSigma[j] = z;
				if(z != 0.0)
				{
					z = 1.0 / z;
					c = f * z;
					s = h * z;
				}
				f = c * g + s * y;
				x = c * y - s * g;
				pendi = pU->colEnd(i);
				pendj = pU->colEnd(j);
				ppi = pU->colBegin(i);
				ppj = pU->colBegin(j);
				for(tpos = 0; ppi != pendi || ppj != pendj; tpos++)
				{
					if(ppj == pendj || (ppi != pendi && ppi->first < ppj->first))
					{
						temp2[tpos] = ppi->first;
						ppi++;
					}
					else
					{
						temp2[tpos] = ppj->first;
						if(ppi != pendi && ppj->first == ppi->first)
							ppi++;
						ppj++;
					}
				}
				for(int tpos2 = 0; tpos2 < tpos; tpos2++)
				{
					y = pU->get((unsigned int)temp2[tpos2], j);
					z = pU->get((unsigned int)temp2[tpos2], i);
					pU->set((unsigned int)temp2[tpos2], j, y * c + z * s);
					pU->set((unsigned int)temp2[tpos2], i, z * c - y * s);
				}
			}
			temp[l] = 0.0;
			temp[k] = f;
			pSigma[k] = x;
		}
	}

	// Sort the singular values from largest to smallest
	for(i = 1; i < n; i++)
	{
		for(j = i; j > 0; j--)
		{
			if(pSigma[j - 1] >= pSigma[j])
				break;
			pU->swapColumns(j - 1, j);
			pV->swapRows(j - 1, j);
			std::swap(pSigma[j - 1], pSigma[j]);
		}
	}

	// Return results
	*ppU = hU.release();
	*ppDiag = hSigma.release();
	*ppV = hV.release();
}