int CORE_dswptr_ontile(PLASMA_desc descA, int i1, int i2, const int *ipiv, int inc, const double *Akk, int ldak) { double zone = 1.0; int lda; int m = descA.mt == 1 ? descA.m : descA.mb; if ( descA.nt > 1 ) { coreblas_error(1, "Illegal value of descA.nt"); return -1; } if ( i1 < 1 ) { coreblas_error(2, "Illegal value of i1"); return -2; } if ( (i2 < i1) || (i2 > m) ) { coreblas_error(3, "Illegal value of i2"); return -3; } CORE_dlaswp_ontile(descA, i1, i2, ipiv, inc); lda = BLKLDD(descA, 0); cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit, m, descA.n, (zone), Akk, ldak, A(0, 0), lda ); return PLASMA_SUCCESS; }
int CORE_zgetrf_incpiv(int M, int N, int IB, PLASMA_Complex64_t *A, int LDA, int *IPIV, int *INFO) { int i, j, k, sb; int iinfo; /* Check input arguments */ *INFO = 0; if (M < 0) { coreblas_error(1, "Illegal value of M"); return -1; } if (N < 0) { coreblas_error(2, "Illegal value of N"); return -2; } if (IB < 0) { coreblas_error(3, "Illegal value of IB"); return -3; } if ((LDA < max(1,M)) && (M > 0)) { coreblas_error(5, "Illegal value of LDA"); return -5; } /* Quick return */ if ((M == 0) || (N == 0) || (IB == 0)) return PLASMA_SUCCESS; k = min(M, N); for(i =0 ; i < k; i += IB) { sb = min(IB, k-i); /* * Factor diagonal and subdiagonal blocks and test for exact singularity. */ iinfo = LAPACKE_zgetf2_work(LAPACK_COL_MAJOR, M-i, sb, &A[LDA*i+i], LDA, &IPIV[i]); /* * Adjust INFO and the pivot indices. */ if((*INFO == 0) && (iinfo > 0)) *INFO = iinfo + i; if (i+sb < N) { CORE_zgessm( M-i, N-(i+sb), sb, sb, &IPIV[i], &A[LDA*i+i], LDA, &A[LDA*(i+sb)+i], LDA); } for(j = i; j < i+sb; j++) { IPIV[j] = i + IPIV[j]; } } return PLASMA_SUCCESS; }
int CORE_cgetrf_reclap(int M, int N, PLASMA_Complex32_t *A, int LDA, int *IPIV, int *info) { int thidx = info[1]; int thcnt = min( info[2], M / N ); int minMN = min(M, N); if( M < 0 ) { coreblas_error(1, "illegal value of M"); return -1; } if( N < 0 ) { coreblas_error(2, "illegal value of N"); return -2; } if( LDA < max(1, M) ) { coreblas_error(5, "illegal value of LDA"); return -5; } /* * Quick return */ if ( (M == 0) || (N == 0) || (thidx >= thcnt) ){ return PLASMA_SUCCESS; } *info = 0; CORE_cgetrf_reclap_rec( M, minMN, A, LDA, IPIV, info, thidx, thcnt, 0 ); if ( N > minMN ) { CORE_cgetrf_reclap_update(M, 0, minMN, N-minMN, A, LDA, IPIV, thidx, thcnt); } return info[0]; }
int CORE_dgetrf_rectil(const PLASMA_desc A, int *IPIV, int *info) { int ft, lt; int thidx = info[1]; int thcnt = min( info[2], A.mt ); int minMN = min( A.m, A.n ); double pivot; info[0] = 0; info[2] = thcnt; if ( A.nt > 1 ) { coreblas_error(1, "Illegal value of A.nt"); info[0] = -1; return -1; } if ( thidx >= thcnt ) return 0; int q = A.mt / thcnt; int r = A.mt % thcnt; if (thidx < r) { q++; ft = thidx * q; lt = ft + q; } else { ft = r * (q + 1) + (thidx - r) * q; lt = ft + q; lt = min( lt, A.mt ); } CORE_dgetrf_rectil_rec( A, IPIV, info, &pivot, thidx, thcnt, 0, minMN, ft, lt); if ( A.n > minMN ) { CORE_dgetrf_rectil_update( A, IPIV, 0, minMN, A.n-minMN, thidx, thcnt, ft, lt); } return info[0]; }
int CORE_ztsmqr_hetra1( int side, int trans, int m1, int n1, int m2, int n2, int k, int ib, PLASMA_Complex64_t *A1, int lda1, PLASMA_Complex64_t *A2, int lda2, PLASMA_Complex64_t *V, int ldv, PLASMA_Complex64_t *T, int ldt, PLASMA_Complex64_t *WORK, int ldwork) { int i, j; if ( (m1 != n1) ) { coreblas_error(3, "Illegal value of M1, N1"); return -3; } /* in-place transposition of A1 */ for (j = 0; j < n1; j++){ A1[j + j*lda1] = conj(A1[j + j*lda1]); for (i = j+1; i < m1; i++){ *WORK = *(A1 + i + j*lda1); *(A1 + i + j*lda1) = conj(*(A1 + j + i*lda1)); *(A1 + j + i*lda1) = conj(*WORK); } } CORE_ztsmqr(side, trans, m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); /* in-place transposition of A1 */ for (j = 0; j < n1; j++){ A1[j + j*lda1] = conj(A1[j + j*lda1]); for (i = j+1; i < m1; i++){ *WORK = *(A1 + i + j*lda1); *(A1 + i + j*lda1) = conj(*(A1 + j + i*lda1)); *(A1 + j + i*lda1) = conj(*WORK); } } return PLASMA_SUCCESS; }
int CORE_chbelr(int uplo, int N, PLASMA_desc *A, PLASMA_Complex32_t *V, PLASMA_Complex32_t *TAU, int st, int ed, int eltsize) { int NB, J1, J2; int len1, len2, t1ed, t2st; int i; static PLASMA_Complex32_t zzero = 0.0; PLASMA_desc vA=*A; /* Check input arguments */ if (N < 0) { coreblas_error(2, "Illegal value of N"); return -2; } if (ed <= st) { coreblas_error(23, "Illegal value of st and ed (internal)"); return -23; } /* Quick return */ if (N == 0) return PLASMA_SUCCESS; NB = A->mb; if( uplo == PlasmaLower ) { /* ======================== * LOWER CASE * ========================*/ for (i = ed; i >= st+1 ; i--){ /* generate Householder to annihilate a(i+k-1,i) within the band */ *V(i) = *A(i, (st-1)); *A(i, (st-1)) = zzero; LAPACKE_clarfg_work( 2, A((i-1),(st-1)), V(i), 1, TAU(i)); /* apply reflector from the left (horizontal row) and from the right for only the diagonal 2x2.*/ J1 = st; J2 = i-2; t1ed = (J2/NB)*NB; t2st = max(t1ed+1,J1); len1 = t1ed-J1+1; /* can be negative */ len2 = J2-t2st+1; if(len1>0)CORE_clarfx2(PlasmaLeft, len1 , *V(i), conjf(*TAU(i)), A(i-1, J1 ), ELTLDD(vA, i-1), A(i, J1 ), ELTLDD(vA, i) ); if(len2>0)CORE_clarfx2(PlasmaLeft, len2 , *V(i), conjf(*TAU(i)), A(i-1, t2st), ELTLDD(vA, i-1), A(i, t2st), ELTLDD(vA, i) ); CORE_clarfx2c(PlasmaLower, *V(i), *TAU(i), A(i-1, i-1), A(i, i-1), A(i, i)); } /* APPLY RIGHT ON THE REMAINING ELEMENT OF KERNEL 1 */ for (i = ed; i >= st+1 ; i--){ J1 = i+1; J2 = min(ed,N); t1ed = (J2/NB)*NB; t2st = max(t1ed+1,J1); len1 = t1ed-J1+1; /* can be negative */ len2 = J2-t2st+1; if(len1>0)CORE_clarfx2(PlasmaRight, len1, *V(i), *TAU(i), A(J1, i-1), ELTLDD(vA, J1) , A(J1 , i), ELTLDD(vA, J1) ); if(len2>0)CORE_clarfx2(PlasmaRight, len2, *V(i), *TAU(i), A(t2st,i-1), ELTLDD(vA, t2st), A(t2st, i), ELTLDD(vA, t2st) ); } }else{ /* ======================== * UPPER CASE * ========================*/ for (i = ed; i >= st+1 ; i--){ /* generate Householder to annihilate a(i+k-1,i) within the band*/ *V(i) = *A((st-1), i); *A((st-1), i) = zzero; LAPACKE_clarfg_work( 2, A(st-1, i-1), V(i), 1, TAU(i)); /* apply reflector from the left (horizontal row) and from the right for only the diagonal 2x2.*/ J1 = st; J2 = i-2; t1ed = (J2/NB)*NB; t2st = max(t1ed+1,J1); len1 = t1ed-J1+1; /* can be negative */ len2 = J2-t2st+1; if(len1>0)CORE_clarfx2(PlasmaRight, len1, conjf(*V(i)), conjf(*TAU(i)), A(J1, i-1), ELTLDD(vA, J1) , A(J1 , i), ELTLDD(vA, J1) ); if(len2>0)CORE_clarfx2(PlasmaRight, len2, conjf(*V(i)), conjf(*TAU(i)), A(t2st,i-1), ELTLDD(vA, t2st), A(t2st, i), ELTLDD(vA, t2st) ); CORE_clarfx2c(PlasmaUpper, *V(i), *TAU(i), A(i-1, i-1), A(i-1, i), A(i,i)); } /* APPLY LEFT ON THE REMAINING ELEMENT OF KERNEL 1 */ for (i = ed; i >= st+1 ; i--){ J1 = i+1; J2 = min(ed,N); t1ed = (J2/NB)*NB; t2st = max(t1ed+1,J1); len1 = t1ed-J1+1; /* can be negative */ len2 = J2-t2st+1; if(len1>0)CORE_clarfx2(PlasmaLeft, len1 , conjf(*V(i)), *TAU(i), A(i-1, J1 ), ELTLDD(vA, i-1), A(i, J1 ), ELTLDD(vA, i) ); if(len2>0)CORE_clarfx2(PlasmaLeft, len2 , conjf(*V(i)), *TAU(i), A(i-1, t2st), ELTLDD(vA, i-1), A(i, t2st), ELTLDD(vA, i) ); } } /* end of else for the upper case */ return PLASMA_SUCCESS; }
int CORE_zgeqrt(int M, int N, int IB, PLASMA_Complex64_t *A, int LDA, PLASMA_Complex64_t *T, int LDT, PLASMA_Complex64_t *TAU, PLASMA_Complex64_t *WORK) { int i, k, sb; /* Check input arguments */ if (M < 0) { coreblas_error(1, "Illegal value of M"); return -1; } if (N < 0) { coreblas_error(2, "Illegal value of N"); return -2; } if ((IB < 0) || ( (IB == 0) && ((M > 0) && (N > 0)) )) { coreblas_error(3, "Illegal value of IB"); return -3; } if ((LDA < max(1,M)) && (M > 0)) { coreblas_error(5, "Illegal value of LDA"); return -5; } if ((LDT < max(1,IB)) && (IB > 0)) { coreblas_error(7, "Illegal value of LDT"); return -7; } /* Quick return */ if ((M == 0) || (N == 0) || (IB == 0)) return PLASMA_SUCCESS; k = min(M, N); for(i = 0; i < k; i += IB) { sb = min(IB, k-i); LAPACKE_zgeqr2_work(LAPACK_COL_MAJOR, M-i, sb, &A[LDA*i+i], LDA, &TAU[i], WORK); LAPACKE_zlarft_work(LAPACK_COL_MAJOR, lapack_const(PlasmaForward), lapack_const(PlasmaColumnwise), M-i, sb, &A[LDA*i+i], LDA, &TAU[i], &T[LDT*i], LDT); if (N > i+sb) { LAPACKE_zlarfb_work( LAPACK_COL_MAJOR, lapack_const(PlasmaLeft), lapack_const(PlasmaConjTrans), lapack_const(PlasmaForward), lapack_const(PlasmaColumnwise), M-i, N-i-sb, sb, &A[LDA*i+i], LDA, &T[LDT*i], LDT, &A[LDA*(i+sb)+i], LDA, WORK, N-i-sb); } } return PLASMA_SUCCESS; }
int CORE_dlaswp_ontile(PLASMA_desc descA, int i1, int i2, const int *ipiv, int inc) { int i, j, ip, it; double *A1; int lda1, lda2; /* Change i1 to C notation */ i1--; /* Check parameters */ if ( descA.nt > 1 ) { coreblas_error(1, "Illegal value of descA.nt"); return -1; } if ( i1 < 0 ) { coreblas_error(2, "Illegal value of i1"); return -2; } if ( (i2 <= i1) || (i2 > descA.m) ) { coreblas_error(3, "Illegal value of i2"); return -3; } if ( ! ( (i2 - i1 - i1%descA.mb -1) < descA.mb ) ) { coreblas_error(2, "Illegal value of i1,i2. They have to be part of the same block."); return -3; } if (inc > 0) { it = i1 / descA.mb; A1 = A(it, 0); lda1 = BLKLDD(descA, 0); for (j = i1; j < i2; ++j, ipiv+=inc) { ip = (*ipiv) - descA.i - 1; if ( ip != j ) { it = ip / descA.mb; i = ip % descA.mb; lda2 = BLKLDD(descA, it); cblas_dswap(descA.n, A1 + j, lda1, A(it, 0) + i, lda2 ); } } } else { it = (i2-1) / descA.mb; A1 = A(it, 0); lda1 = BLKLDD(descA, it); i1--; ipiv = &ipiv[(1-i2)*inc]; for (j = i2-1; j > i1; --j, ipiv+=inc) { ip = (*ipiv) - descA.i - 1; if ( ip != j ) { it = ip / descA.mb; i = ip % descA.mb; lda2 = BLKLDD(descA, it); cblas_dswap(descA.n, A1 + j, lda1, A(it, 0) + i, lda2 ); } } } return PLASMA_SUCCESS; }
int CORE_zttrfb(int side, int trans, int direct, int storev, int M1, int N1, int M2, int N2, int K, PLASMA_Complex64_t *A1, int LDA1, PLASMA_Complex64_t *A2, int LDA2, PLASMA_Complex64_t *V, int LDV, PLASMA_Complex64_t *T, int LDT, PLASMA_Complex64_t *WORK, int LDWORK) { static PLASMA_Complex64_t zone = 1.0; static PLASMA_Complex64_t mzone = -1.0; int j, vi; /* Check input arguments */ if (M1 < 0) { coreblas_error(5, "Illegal value of M1"); return -5; } if (N1 < 0) { coreblas_error(6, "Illegal value of N1"); return -6; } if ((M2 < 0) || ( (side == PlasmaRight) && (M1 != M2) ) ) { coreblas_error(7, "Illegal value of M2"); return -7; } if ((N2 < 0) || ( (side == PlasmaLeft) && (N1 != N2) ) ) { coreblas_error(8, "Illegal value of N2"); return -8; } if (K < 0) { coreblas_error(9, "Illegal value of K"); return -9; } /* Quick return */ if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0)) return PLASMA_SUCCESS; if (storev == PlasmaColumnwise) { if (direct == PlasmaForward) { /* * Let V = ( V1 ) (first K rows) * ( V2 ) * where V2 is non-unit upper triangular */ if (side == PlasmaLeft) { /* * Colwise / Forward / Left * ------------------------- * * Form H * A or H' * A where A = ( A1 ) * ( A2 ) * where A2 = ( A2_1 ) * ( A2_2 ) */ /* * W = A1 + V' * A2 */ /* * W = A2_2 */ LAPACKE_zlacpy_work(LAPACK_COL_MAJOR, lapack_const(PlasmaUpperLower), K, N2, &A2[M2-K], LDA2, WORK, LDWORK); /* * W = V2' * A2_2 */ cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasConjTrans, CblasNonUnit, K, N2, CBLAS_SADDR(zone), &V[M2-K], LDV, WORK, LDWORK); if (M2 > K) { /* * W = W + V1' * A2_1 */ cblas_zgemm( CblasColMajor, CblasConjTrans, CblasNoTrans, K, N2, M2-K, CBLAS_SADDR(zone), V, LDV, A2, LDA2, CBLAS_SADDR(zone), WORK, LDWORK); } /* * W = A1 + W */ for(j = 0; j < N1; j++) { cblas_zaxpy(K, CBLAS_SADDR(zone), &A1[LDA1*j], 1, &WORK[LDWORK*j], 1); } /* * A2 = A2 - V * T * W -> W = T * W, A2 = A2 - V * W */ /* * W = T * W */ cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, (CBLAS_TRANSPOSE)trans, CblasNonUnit, K, N2, CBLAS_SADDR(zone), T, LDT, WORK, LDWORK); /* * A1 = A1 - W */ for(j = 0; j < N1; j++) { cblas_zaxpy(K, CBLAS_SADDR(mzone), &WORK[LDWORK*j], 1, &A1[LDA1*j], 1); } /* * A2_1 = A2_1 - V1 * W */ if (M2 > K) { cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, M2-K, N2, K, CBLAS_SADDR(mzone), V, LDV, WORK, LDWORK, CBLAS_SADDR(zone), A2, LDA2); } /* * W = - V2 * W */ cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, K, N2, CBLAS_SADDR(mzone), &V[M2-K], LDV, WORK, LDWORK); /* * A2_2 = A2_2 + W */ for(j = 0; j < N2; j++) { cblas_zaxpy( K, CBLAS_SADDR(zone), &WORK[LDWORK*j], 1, &A2[LDA2*j+(M2-K)], 1); } } else { /* * Colwise / Forward / Right * ------------------------- * * Form H * A or H' * A where: * * A = ( A1 A2 ) * * A2 = ( A2_1 : A2_2 ) * * A2_1 is M2 x (M2-K) * A2_2 is M2 x K * * V = ( V_1 ) * ( V_2 ) * * V_1 is full and (N2-K) x K * V_2 is upper triangular and K x K */ /* * W = ( A1 + A2_1*V_1 + A2_2*V_2 ) * op(T) * * W is M x K * A1 is M x K * A2 is M x N2 split as (A2_1 A2_2) such as * A2_1 is (N2-K) x K * A2_2 is M x K */ /* W = A2_2 */ LAPACKE_zlacpy_work(LAPACK_COL_MAJOR, lapack_const(PlasmaUpperLower), M2, K, &A2[LDA2*(N2-K)], LDA2, WORK, LDWORK); /* W = W * V_2 --> W = A2_2 * V_2 */ cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, M2, K, CBLAS_SADDR(zone), &V[N2-K], LDV, WORK, LDWORK); /* W = W + A2_1 * V_1 */ if (N2 > K) { cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, M2, K, N2-K, CBLAS_SADDR(zone), A2, LDA2, V, LDV, CBLAS_SADDR(zone), WORK, LDWORK); } /* W = A1 + W */ for (j = 0; j < K; j++) { cblas_zaxpy(M1, CBLAS_SADDR(zone), &A1[LDA1*j], 1, &WORK[LDWORK*j], 1); } /* W = W * T --> ( A1 + A2_1*V_1 + A2_2*V_2 ) * op(T) */ cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, (CBLAS_TRANSPOSE)trans, CblasNonUnit, M2, K, CBLAS_SADDR(zone), T, LDT, WORK, LDWORK); /* * A1 = A1 - W */ for(j = 0; j < K; j++) { cblas_zaxpy(M1, CBLAS_SADDR(mzone), &WORK[LDWORK*j], 1, &A1[LDA1*j], 1); } /* * A2 = A2 - W * V' --> A2 - W*V_1' - W*V_2' */ /* A2 = A2 - W * V_1' */ if (N2 > K) { cblas_zgemm( CblasColMajor, CblasNoTrans, CblasConjTrans, M2, N2-K, K, CBLAS_SADDR(mzone), WORK, LDWORK, V, LDV, CBLAS_SADDR(zone), A2, LDA2); } /* A2 = A2 - W * V_2' */ cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasConjTrans, CblasNonUnit, M2, K, CBLAS_SADDR(mzone), &V[N2-K], LDV, WORK, LDWORK); for(j = 0; j < K; j++) { cblas_zaxpy( M2, CBLAS_SADDR(zone), &WORK[LDWORK*j], 1, &A2[LDA2*(j+N2-K)], 1); } } } else { coreblas_error(3, "Not implemented (ColWise / Backward / Left or Right)"); return PLASMA_ERR_NOT_SUPPORTED; } } else { /* * Rowwise */ if (direct == PlasmaForward) { /* * Let V = ( V1 V2 ) (V1: first K cols) * * where V2 is non-unit lower triangular */ if (side == PlasmaLeft) { /* * Rowwise / Forward / Left * ------------------------- * * Form H * A or H' * A where A = ( A1 ) * ( A2 ) * where A2 = ( A2_1 ) * ( A2_2 ) */ /* V_2 first element */ vi = LDV*(M2-K); /* * W = A1 + V * A2 */ /* * W = A2_2 */ LAPACKE_zlacpy_work(LAPACK_COL_MAJOR, lapack_const(PlasmaUpperLower), K, N2, &A2[M2-K], LDA2, WORK, LDWORK); /* * W = V2 * A2_2 */ //**DB CblasColMajor, CblasLeft, CblasUpper, cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, K, N2, CBLAS_SADDR(zone), &V[vi], LDV, WORK, LDWORK); if (M2 > K) { /* * W = W + V1 * A2_1 */ cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, K, N2, M2-K, CBLAS_SADDR(zone), V, LDV, A2, LDA2, CBLAS_SADDR(zone), WORK, LDWORK); } /* * W = A1 + W */ for(j = 0; j < N1; j++) { cblas_zaxpy( K, CBLAS_SADDR(zone), &A1[LDA1*j], 1, &WORK[LDWORK*j], 1); } /* * W = T * W */ cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, (CBLAS_TRANSPOSE)trans, CblasNonUnit, K, N2, CBLAS_SADDR(zone), T, LDT, WORK, LDWORK); /* * A1 = A1 - W */ for(j = 0; j < N1; j++) { cblas_zaxpy( K, CBLAS_SADDR(mzone), &WORK[LDWORK*j], 1, &A1[LDA1*j], 1); } /* * A2 = A2 - V' * T * W -> A2 = A2 - V' * W */ /* * A2_1 = A2_1 - V1' * W */ if (M2 > K) { cblas_zgemm( CblasColMajor, CblasConjTrans, CblasNoTrans, M2-K, N2, K, CBLAS_SADDR(mzone), V, LDV, WORK, LDWORK, CBLAS_SADDR(zone), A2, LDA2); } /* * W = - V2' * W */ cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasConjTrans, CblasNonUnit, K, N2, CBLAS_SADDR(mzone), &V[vi], LDV, WORK, LDWORK); /* * A2_2 = A2_2 + W */ for(j = 0; j < N2; j++) { cblas_zaxpy( K, CBLAS_SADDR(zone), &WORK[LDWORK*j], 1, &A2[LDA2*j+(M2-K)], 1); } } else { /* * Rowwise / Forward / Right * ------------------------- * * Form H * A or H' * A where: * * A = ( A1 A2 ) * * A2 = ( A2_1 : A2_2 ) * * A2_1 is M2 x (M2-K) * A2_2 is M2 x K * * V = ( V_1 ) * ( V_2 ) * * V_1 is full and (N2-K) x K * V_2 is lower triangular and K x K */ /* * W = ( A1 + A2_1*V_1 + A2_2*V_2 ) * op(T) * * W is M x K * A1 is M x K * A2 is M x N2 split as (A2_1 A2_2) such as * A2_1 is (N2-K) x K * A2_2 is M x K */ /* V_2 and A2_2 first element */ vi = LDV*(N2-K); /* W = A2_2 */ LAPACKE_zlacpy_work(LAPACK_COL_MAJOR, lapack_const(PlasmaUpperLower), M2, K, &A2[LDA2*(N2-K)], LDA2, WORK, LDWORK); /* W = W * V_2' --> W = A2_2 * V_2' */ cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasConjTrans, CblasNonUnit, M2, K, CBLAS_SADDR(zone), &V[vi], LDV, WORK, LDWORK); /* W = W + A2_1 * V_1' */ if (N2 > K) { cblas_zgemm( CblasColMajor, CblasNoTrans, CblasConjTrans, M2, K, N2-K, CBLAS_SADDR(zone), A2, LDA2, V, LDV, CBLAS_SADDR(zone), WORK, LDWORK); } /* W = A1 + W */ for (j = 0; j < K; j++) { cblas_zaxpy(M1, CBLAS_SADDR(zone), &A1[LDA1*j], 1, &WORK[LDWORK*j], 1); } /* W = W * op(T) --> ( A1 + A2_1*V_1 + A2_2*V_2 ) * op(T) */ cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, (CBLAS_TRANSPOSE)trans, CblasNonUnit, M2, K, CBLAS_SADDR(zone), T, LDT, WORK, LDWORK); /* * A1 = A1 - W */ for(j = 0; j < K; j++) { cblas_zaxpy(M1, CBLAS_SADDR(mzone), &WORK[LDWORK*j], 1, &A1[LDA1*j], 1); } /* * A2 = A2 - W * V --> A2 - W*V_1 - W*V_2 */ /* A2 = A2 - W * V_1 */ if (N2 > K) { cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, M2, N2-K, K, CBLAS_SADDR(mzone), WORK, LDWORK, V, LDV, CBLAS_SADDR(zone), A2, LDA2); } /* A2 = A2 - W * V_2 */ cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, M2, K, CBLAS_SADDR(mzone), &V[vi], LDV, WORK, LDWORK); for(j = 0; j < K; j++) { cblas_zaxpy( M2, CBLAS_SADDR(zone), &WORK[LDWORK*j], 1, &A2[LDA2*(N2-K+j)], 1); } } } else { coreblas_error(3, "Not implemented (Rowwise / Backward / Left or Right)"); return PLASMA_ERR_NOT_SUPPORTED; } } return PLASMA_SUCCESS; }
int CORE_stsrfb(int side, int trans, int direct, int storev, int M1, int N1, int M2, int N2, int K, float *A1, int LDA1, float *A2, int LDA2, float *V, int LDV, float *T, int LDT, float *WORK, int LDWORK) { static float zone = 1.0; static float mzone = -1.0; int j; /* Check input arguments */ if (M1 < 0) { coreblas_error(5, "Illegal value of M1"); return -5; } if (N1 < 0) { coreblas_error(6, "Illegal value of N1"); return -6; } if ( (M2 < 0) || ( (M2 != M1) && (side == PlasmaRight) ) ){ coreblas_error(7, "Illegal value of M2"); return -7; } if ( (N2 < 0) || ( (N2 != N1) && (side == PlasmaLeft) ) ){ coreblas_error(8, "Illegal value of N2"); return -8; } if (K < 0) { coreblas_error(9, "Illegal value of K"); return -9; } /* Quick return */ if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0)) return PLASMA_SUCCESS; if (storev == PlasmaColumnwise) { if (direct == PlasmaForward) { if (side == PlasmaLeft) { /* * B = A1 + V' * A2 */ LAPACKE_slacpy_work(LAPACK_COL_MAJOR, lapack_const(PlasmaUpperLower), K, N1, A1, LDA1, WORK, LDWORK); cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans, K, N2, M2, (zone), V, LDV, A2, LDA2, (zone), WORK, LDWORK); /* * A2 = A2 - V*T*B -> B = T*B, A2 = A2 - V*B */ cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, (CBLAS_TRANSPOSE)trans, CblasNonUnit, K, N2, (zone), T, LDT, WORK, LDWORK); cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, M2, N2, K, (mzone), V, LDV, WORK, LDWORK, (zone), A2, LDA2); /* * A1 = A1 - B */ for(j = 0; j < N1; j++) { cblas_saxpy( K, (mzone), &WORK[LDWORK*j], 1, &A1[LDA1*j], 1); } } /* * Columnwise / Forward / Right */ else { /* * B = A1 + A2 * V */ LAPACKE_slacpy_work(LAPACK_COL_MAJOR, lapack_const(PlasmaUpperLower), M1, K, A1, LDA1, WORK, LDWORK); cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, M2, K, N2, (zone), A2, LDA2, V, LDV, (zone), WORK, LDWORK); /* * A2 = A2 - B*T*V' -> B = B*T, A2 = A2 - B*V' */ cblas_strmm( CblasColMajor, CblasRight, CblasUpper, (CBLAS_TRANSPOSE)trans, CblasNonUnit, M1, K, (zone), T, LDT, WORK, LDWORK); cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans, M2, N2, K, (mzone), WORK, LDWORK, V, LDV, (zone), A2, LDA2); /* * A1 = A1 - B */ for(j = 0; j < K; j++) { cblas_saxpy( M1, (mzone), &WORK[LDWORK*j], 1, &A1[LDA1*j], 1); } } } else { coreblas_error(3, "Not implemented (ColMajor / Backward / Left or Right)"); return PLASMA_ERR_NOT_SUPPORTED; } } else { if (direct == PlasmaForward) { /* * Rowwise / Forward / Left */ if (side == PlasmaLeft) { /* * B = A1 + V * A2 */ LAPACKE_slacpy_work(LAPACK_COL_MAJOR, lapack_const(PlasmaUpperLower), K, N1, A1, LDA1, WORK, LDWORK); cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, K, N2, M2, (zone), V, LDV, A2, LDA2, (zone), WORK, LDWORK); /* * A2 = A2 - V'*T*B -> B = T*B, A2 = A2 - V'*B */ cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, (CBLAS_TRANSPOSE)trans, CblasNonUnit, K, N2, (zone), T, LDT, WORK, LDWORK); cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans, M2, N2, K, (mzone), V, LDV, WORK, LDWORK, (zone), A2, LDA2); /* * A1 = A1 - B */ for(j=0; j<N1; j++) { cblas_saxpy( K, (mzone), &WORK[LDWORK*j], 1, &A1[LDA1*j], 1); } } /* * Rowwise / Forward / Right */ else { /* * B = A1 + A2 * V' */ LAPACKE_slacpy_work(LAPACK_COL_MAJOR, lapack_const(PlasmaUpperLower), M1, K, A1, LDA1, WORK, LDWORK); cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans, M2, K, N2, (zone), A2, LDA2, V, LDV, (zone), WORK, LDWORK); /* * A2 = A2 - B*T*V -> B = B*T, A2 = A2 - B*V' */ cblas_strmm( CblasColMajor, CblasRight, CblasUpper, (CBLAS_TRANSPOSE)trans, CblasNonUnit, M1, K, (zone), T, LDT, WORK, LDWORK); cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, M2, N2, K, (mzone), WORK, LDWORK, V, LDV, (zone), A2, LDA2); /* * A1 = A1 - B */ for(j = 0; j < K; j++) { cblas_saxpy( M1, (mzone), &WORK[LDWORK*j], 1, &A1[LDA1*j], 1); } } } else { coreblas_error(3, "Not implemented (RowMajor / Backward / Left or Right)"); return PLASMA_ERR_NOT_SUPPORTED; } } return PLASMA_SUCCESS; }
int CORE_dtsmlq_corner( int m1, int n1, int m2, int n2, int m3, int n3, int k, int ib, int nb, double *A1, int lda1, double *A2, int lda2, double *A3, int lda3, double *V, int ldv, double *T, int ldt, double *WORK, int ldwork) { PLASMA_enum side; PLASMA_enum trans; int i, j; if ( m1 != n1 ) { coreblas_error(1, "Illegal value of M1, N1"); return -1; } /* Rebuild the symmetric block: WORK <- A1 */ for (i = 0; i < m1; i++) for (j = i; j < n1; j++){ *(WORK + i + j*ldwork) = *(A1 + i + j*lda1); if (j > i){ *(WORK + j + i*ldwork) = ( *(WORK + i + j*ldwork) ); } } /* Copy the transpose of A2: WORK+nb*ldwork <- A2' */ for (j = 0; j < n2; j++) for (i = 0; i < m2; i++){ *(WORK + j + (i + nb) * ldwork) = ( *(A2 + i + j*lda2) ); } side = PlasmaRight; trans = PlasmaTrans; /* Right application on |A1 A2| */ CORE_dtsmlq(side, trans, m1, n1, m2, n2, k, ib, WORK, ldwork, A2, lda2, V, ldv, T, ldt, WORK+3*nb*ldwork, ldwork); /* Rebuild the symmetric block: WORK+2*nb*ldwork <- A3 */ for (i = 0; i < m3; i++) for (j = i; j < n3; j++){ *(WORK + i + (j + 2*nb) * ldwork) = *(A3 + i + j*lda3); if (j > i){ *(WORK + j + (i + 2*nb) * ldwork) = ( *(WORK + i + (j + 2*nb) * ldwork) ); } } /* Right application on | A2' A3 | */ CORE_dtsmlq(side, trans, n2, m2, m3, n3, k, ib, WORK+nb*ldwork, ldwork, WORK+2*nb*ldwork, ldwork, V, ldv, T, ldt, WORK + 3*nb*ldwork, ldwork); side = PlasmaLeft; trans = PlasmaNoTrans; /* Left application on | A1 | */ /* | A2' | */ CORE_dtsmlq(side, trans, m1, n1, n2, m2, k, ib, WORK, ldwork, WORK+nb*ldwork, ldwork, V, ldv, T, ldt, WORK + 3*nb*ldwork, ldwork); /* Copy back the final result to the upper part of A1 */ /* A1 = WORK */ for (i = 0; i < m1; i++) for (j = i; j < n1; j++) *(A1 + i + j*lda1) = *(WORK + i + j*ldwork); /* Left application on | A2 | */ /* | A3 | */ CORE_dtsmlq(side, trans, m2, n2, m3, n3, k, ib, A2, lda2, WORK+2*nb*ldwork, ldwork, V, ldv, T, ldt, WORK + 3*nb*ldwork, ldwork); /* Copy back the final result to the upper part of A3 */ /* A3 = WORK+2*nb*ldwork */ for (i = 0; i < m3; i++) for (j = i; j < n3; j++) *(A3 + i + j*lda3) = *(WORK + i + (j+ 2*nb) * ldwork); return PLASMA_SUCCESS; }
int CORE_shbrce(int uplo, int N, PLASMA_desc *A, float *V, float *TAU, int st, int ed, int eltsize) { int NB, J1, J2, J3, KDM2, len, pt; int len1, len2, t1ed, t2st; int i; static float zzero = 0.0; PLASMA_desc vA=*A; /* Check input arguments */ if (N < 0) { coreblas_error(2, "Illegal value of N"); return -2; } if (ed <= st) { coreblas_error(6, "Illegal value of st and ed (internal)"); return -6; } /* Quick return */ if (N == 0) return PLASMA_SUCCESS; NB = A->mb; KDM2 = A->mb-2; if( uplo == PlasmaLower ) { /* ======================== * LOWER CASE * ========================*/ for (i = ed; i >= st+1 ; i--){ /* apply Householder from the right. and create newnnz outside the band if J3 < N */ J1 = ed+1; J2 = min((i+1+KDM2), N); J3 = min((J2+1), N); len = J3-J1+1; if(J3>J2)*A(J3,(i-1))=zzero;/* could be removed because A is supposed to be band.*/ t1ed = (J3/NB)*NB; t2st = max(t1ed+1,J1); len1 = t1ed-J1+1; /* can be negative*/ len2 = J3-t2st+1; if(len1>0)CORE_slarfx2(PlasmaRight, len1, *V(i), *TAU(i), A(J1, i-1), ELTLDD(vA, J1), A(J1 , i), ELTLDD(vA, J1) ); if(len2>0)CORE_slarfx2(PlasmaRight, len2, *V(i), *TAU(i), A(t2st,i-1), ELTLDD(vA, t2st), A(t2st, i), ELTLDD(vA, t2st)); /* if nonzero element a(j+kd,j-1) has been created outside the band (if index < N) then eliminate it.*/ len = J3-J2; // soit 1 soit 0 if(len>0){ /* generate Householder to annihilate a(j+kd,j-1) within the band */ *V(J3) = *A(J3,i-1); *A(J3,i-1) = 0.0; LAPACKE_slarfg_work( 2, A(J2,i-1), V(J3), 1, TAU(J3)); } } /* APPLY LEFT ON THE REMAINING ELEMENT OF KERNEL 2 */ for (i = ed; i >= st+1 ; i--){ /* find if there was a nnz created. if yes apply left else nothing to be done.*/ J2 = min((i+1+KDM2), N); J3 = min((J2+1), N); len = J3-J2; if(len>0){ pt = J2; J1 = i; J2 = min(ed,N); t1ed = (J2/NB)*NB; t2st = max(t1ed+1,J1); len1 = t1ed-J1+1; /* can be negative*/ len2 = J2-t2st+1; if(len1>0)CORE_slarfx2(PlasmaLeft, len1 , *V(J3), (*TAU(J3)), A(pt, i ), ELTLDD(vA, pt), A((pt+1), i ), ELTLDD(vA, pt+1) ); if(len2>0)CORE_slarfx2(PlasmaLeft, len2 , *V(J3), (*TAU(J3)), A(pt, t2st), ELTLDD(vA, pt), A((pt+1), t2st), ELTLDD(vA, pt+1) ); } } } else { /* ======================== * UPPER CASE * ========================*/ for (i = ed; i >= st+1 ; i--){ /* apply Householder from the right. and create newnnz outside the band if J3 < N */ J1 = ed+1; J2 = min((i+1+KDM2), N); J3 = min((J2+1), N); len = J3-J1+1; if(J3>J2)*A((i-1), J3)=zzero;/* could be removed because A is supposed to be band.*/ t1ed = (J3/NB)*NB; t2st = max(t1ed+1,J1); len1 = t1ed-J1+1; /* can be negative*/ len2 = J3-t2st+1; if(len1>0)CORE_slarfx2(PlasmaLeft, len1 , (*V(i)), *TAU(i), A(i-1, J1 ), ELTLDD(vA, (i-1)), A(i, J1 ), ELTLDD(vA, i) ); if(len2>0)CORE_slarfx2(PlasmaLeft, len2 , (*V(i)), *TAU(i), A(i-1, t2st), ELTLDD(vA, (i-1)), A(i, t2st), ELTLDD(vA, i) ); /* if nonzero element a(j+kd,j-1) has been created outside the band (if index < N) then eliminate it.*/ len = J3-J2; /* either 1 soit 0*/ if(len>0){ /* generate Householder to annihilate a(j+kd,j-1) within the band*/ *V(J3) = *A((i-1), J3); *A((i-1), J3) = 0.0; LAPACKE_slarfg_work( 2, A((i-1), J2), V(J3), 1, TAU(J3)); } } /* APPLY RIGHT ON THE REMAINING ELEMENT OF KERNEL 2*/ for (i = ed; i >= st+1 ; i--){ /* find if there was a nnz created. if yes apply right else nothing to be done.*/ J2 = min((i+1+KDM2), N); J3 = min((J2+1), N); len = J3-J2; if(len>0){ pt = J2; J1 = i; J2 = min(ed,N); t1ed = (J2/NB)*NB; t2st = max(t1ed+1,J1); len1 = t1ed-J1+1; /* can be negative*/ len2 = J2-t2st+1; if(len1>0)CORE_slarfx2(PlasmaRight, len1 , (*V(J3)), (*TAU(J3)), A(i , pt), ELTLDD(vA, i), A(i, pt+1), ELTLDD(vA, i) ); if(len2>0)CORE_slarfx2(PlasmaRight, len2 , (*V(J3)), (*TAU(J3)), A(t2st, pt), ELTLDD(vA, t2st), A(t2st, pt+1), ELTLDD(vA, t2st) ); } } } /* end of else for the upper case */ return PLASMA_SUCCESS; }
int CORE_cttmlq(int side, int trans, int M1, int N1, int M2, int N2, int K, int IB, PLASMA_Complex32_t *A1, int LDA1, PLASMA_Complex32_t *A2, int LDA2, PLASMA_Complex32_t *V, int LDV, PLASMA_Complex32_t *T, int LDT, PLASMA_Complex32_t *WORK, int LDWORK) { int i, i1, i3; int NQ, NW; int kb; int ic = 0; int jc = 0; int mi1 = M1; int mi2 = M2; int ni1 = N1; int ni2 = N2; /* Check input arguments */ if ((side != PlasmaLeft) && (side != PlasmaRight)) { coreblas_error(1, "Illegal value of side"); return -1; } /* NQ is the order of Q */ if (side == PlasmaLeft) { NQ = N2; NW = IB; } else { NQ = M2; NW = N1; } if ((trans != PlasmaNoTrans) && (trans != PlasmaConjTrans)) { coreblas_error(2, "Illegal value of trans"); return -2; } if (M1 < 0) { coreblas_error(3, "Illegal value of M1"); return -3; } if (N1 < 0) { coreblas_error(4, "Illegal value of N1"); return -4; } if ((M2 < 0) || ( (side == PlasmaRight) && (M1 != M2) ) ) { coreblas_error(5, "Illegal value of M2"); return -5; } if ((N2 < 0) || ( (side == PlasmaLeft) && (N1 != N2) ) ) { coreblas_error(6, "Illegal value of N2"); return -6; } if ((K < 0) || ( (side == PlasmaLeft) && (K > M1) ) || ( (side == PlasmaRight) && (K > N1) ) ) { coreblas_error(7, "Illegal value of K"); return -7; } if (IB < 0) { coreblas_error(8, "Illegal value of IB"); return -8; } if (LDA1 < max(1,M1)){ coreblas_error(10, "Illegal value of LDA1"); return -10; } if (LDA2 < max(1,M2)){ coreblas_error(12, "Illegal value of LDA2"); return -12; } if (LDV < max(1,NQ)){ coreblas_error(14, "Illegal value of LDV"); return -14; } if (LDT < max(1,IB)){ coreblas_error(16, "Illegal value of LDT"); return -16; } if (LDWORK < max(1,NW)){ coreblas_error(18, "Illegal value of LDWORK"); return -18; } /* Quick return */ if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) return PLASMA_SUCCESS; if (((side == PlasmaLeft) && (trans == PlasmaNoTrans)) || ((side == PlasmaRight) && (trans != PlasmaNoTrans))) { i1 = 0; i3 = IB; } else { i1 = ( ( K-1 ) / IB )*IB; i3 = -IB; } /* Transpose */ if (trans == PlasmaNoTrans) { trans = PlasmaConjTrans; } else { trans = PlasmaNoTrans; } for (i = i1; (i > -1) && (i < K); i+=i3) { kb = min(IB, K-i); if (side == PlasmaLeft) { mi1 = M1 - i; mi2 = i + kb; ic = i; } else { /* ni2 must be the number of rows of V */ ni1 = kb; ni2 = i + kb; /*if ((K == IB) && (NeqIB > 0)) { ni2 = N2 - i; }*/ jc = i; } /* * H or H' is applied to C(i:m,1:n) */ CORE_cttrfb( side, trans, PlasmaForward, PlasmaRowwise, mi1, ni1, mi2, ni2, kb, &A1[LDA1*jc+ic], LDA1, A2, LDA2, &V[i], LDV, &T[LDT*i], LDT, WORK, LDWORK); } return PLASMA_SUCCESS; }
int CORE_sormqr(int side, int trans, int M, int N, int K, int IB, float *A, int LDA, float *T, int LDT, float *C, int LDC, float *WORK, int LDWORK) { int i, kb; int i1, i3; int nq, nw; int ic = 0; int jc = 0; int ni = N; int mi = M; /* Check input arguments */ if ((side != PlasmaLeft) && (side != PlasmaRight)) { coreblas_error(1, "Illegal value of side"); return -1; } /* * NQ is the order of Q and NW is the minimum dimension of WORK */ if (side == PlasmaLeft) { nq = M; nw = N; } else { nq = N; nw = M; } if ((trans != PlasmaNoTrans) && (trans != PlasmaTrans)) { coreblas_error(2, "Illegal value of trans"); return -2; } if (M < 0) { coreblas_error(3, "Illegal value of M"); return -3; } if (N < 0) { coreblas_error(4, "Illegal value of N"); return -4; } if ((K < 0) || (K > nq)) { coreblas_error(5, "Illegal value of K"); return -5; } if ((IB < 0) || ( (IB == 0) && ((M > 0) && (N > 0)) )) { coreblas_error(6, "Illegal value of IB"); return -6; } if ((LDA < max(1,nq)) && (nq > 0)) { coreblas_error(8, "Illegal value of LDA"); return -8; } if ((LDC < max(1,M)) && (M > 0)) { coreblas_error(12, "Illegal value of LDC"); return -12; } if ((LDWORK < max(1,nw)) && (nw > 0)) { coreblas_error(14, "Illegal value of LDWORK"); return -14; } /* Quick return */ if ((M == 0) || (N == 0) || (K == 0)) return PLASMA_SUCCESS; if (((side == PlasmaLeft) && (trans != PlasmaNoTrans)) || ((side == PlasmaRight) && (trans == PlasmaNoTrans))) { i1 = 0; i3 = IB; } else { i1 = ( ( K-1 ) / IB )*IB; i3 = -IB; } for(i = i1; (i >- 1) && (i < K); i+=i3 ) { kb = min(IB, K-i); if (side == PlasmaLeft) { /* * H or H' is applied to C(i:m,1:n) */ mi = M - i; ic = i; } else { /* * H or H' is applied to C(1:m,i:n) */ ni = N - i; jc = i; } /* * Apply H or H' */ LAPACKE_slarfb_work(LAPACK_COL_MAJOR, lapack_const(side), lapack_const(trans), lapack_const(PlasmaForward), lapack_const(PlasmaColumnwise), mi, ni, kb, &A[LDA*i+i], LDA, &T[LDT*i], LDT, &C[LDC*jc+ic], LDC, WORK, LDWORK); } return PLASMA_SUCCESS; }
/***************************************************************************//** * * @ingroup CORE_double * * ZPAMM performs one of the matrix-matrix operations * * LEFT RIGHT * OP PlasmaW : W = A1 + op(V) * A2 or W = A1 + A2 * op(V) * OP PlasmaA2 : A2 = A2 - op(V) * W or A2 = A2 - W * op(V) * * where op( V ) is one of * * op( V ) = V or op( V ) = V**T or op( V ) = V**T, * * A1, A2 and W are general matrices, and V is: * * l = k: rectangle + triangle * l < k: rectangle + trapezoid * l = 0: rectangle * * Size of V, both rowwise and columnwise, is: * * ---------------------- * side trans size * ---------------------- * left N M x K * T K x M * right N K x N * T N x K * ---------------------- * * LEFT (columnwise and rowwise): * * | K | | M | * _ __________ _ _______________ _ * | | | | | \ * V: | | | V': |_____________|___\ K * | | | M-L | | * M | | | |__________________| _ * |____| | _ * \ | | | M - L | L | * \ | | L * _ \|____| _ * * * RIGHT (columnwise and rowwise): * * | K | | N | * _______________ _ _ __________ _ * | | \ | | | * V': |_____________|___\ N V: | | | * | | | | | K-L * |__________________| _ K | | | * |____| | _ * | K - L | L | \ | | * \ | | L * _ \|____| _ * * Arguments * ========== * * @param[in] op * * OP specifies which operation to perform: * * @arg PlasmaW : W = A1 + op(V) * A2 or W = A1 + A2 * op(V) * @arg PlasmaA2 : A2 = A2 - op(V) * W or A2 = A2 - W * op(V) * * @param[in] side * * SIDE specifies whether op( V ) multiplies A2 * or W from the left or right as follows: * * @arg PlasmaLeft : multiply op( V ) from the left * OP PlasmaW : W = A1 + op(V) * A2 * OP PlasmaA2 : A2 = A2 - op(V) * W * * @arg PlasmaRight : multiply op( V ) from the right * OP PlasmaW : W = A1 + A2 * op(V) * OP PlasmaA2 : A2 = A2 - W * op(V) * * @param[in] storev * * Indicates how the vectors which define the elementary * reflectors are stored in V: * * @arg PlasmaColumnwise * @arg PlasmaRowwise * * @param[in] M * The number of rows of the A1, A2 and W * If SIDE is PlasmaLeft, the number of rows of op( V ) * * @param[in] N * The number of columns of the A1, A2 and W * If SIDE is PlasmaRight, the number of columns of op( V ) * * @param[in] K * If SIDE is PlasmaLeft, the number of columns of op( V ) * If SIDE is PlasmaRight, the number of rows of op( V ) * * @param[in] L * The size of the triangular part of V * * @param[in] A1 * On entry, the M-by-N tile A1. * * @param[in] LDA1 * The leading dimension of the array A1. LDA1 >= max(1,M). * * @param[in,out] A2 * On entry, the M-by-N tile A2. * On exit, if OP is PlasmaA2 A2 is overwritten * * @param[in] LDA2 * The leading dimension of the tile A2. LDA2 >= max(1,M). * * @param[in] V * The matrix V as described above. * If SIDE is PlasmaLeft : op( V ) is M-by-K * If SIDE is PlasmaRight: op( V ) is K-by-N * * @param[in] LDV * The leading dimension of the array V. * * @param[in,out] W * On entry, the M-by-N matrix W. * On exit, W is overwritten either if OP is PlasmaA2 or PlasmaW. * If OP is PlasmaA2, W is an input and is used as a workspace. * * @param[in] LDW * The leading dimension of array WORK. * ******************************************************************************* * * @return * \retval PLASMA_SUCCESS successful exit * \retval <0 if -i, the i-th argument had an illegal value * ******************************************************************************/ int CORE_dpamm(int op, PLASMA_enum side, PLASMA_enum storev, int M, int N, int K, int L, const double *A1, int LDA1, double *A2, int LDA2, const double *V, int LDV, double *W, int LDW) { int vi2, vi3, uplo, trans, info; /* Check input arguments */ if ((op != PlasmaW) && (op != PlasmaA2)) { coreblas_error(1, "Illegal value of op"); return -1; } if ((side != PlasmaLeft) && (side != PlasmaRight)) { coreblas_error(2, "Illegal value of side"); return -2; } if ((storev != PlasmaColumnwise) && (storev != PlasmaRowwise)) { coreblas_error(3, "Illegal value of storev"); return -3; } if (M < 0) { coreblas_error(4, "Illegal value of M"); return -4; } if (N < 0) { coreblas_error(5, "Illegal value of N"); return -5; } if (K < 0) { coreblas_error(6, "Illegal value of K"); return -6; } if (L < 0) { coreblas_error(7, "Illegal value of L"); return -7; } if (LDA1 < 0) { coreblas_error(9, "Illegal value of LDA1"); return -9; } if (LDA2 < 0) { coreblas_error(11, "Illegal value of LDA2"); return -11; } if (LDV < 0) { coreblas_error(13, "Illegal value of LDV"); return -13; } if (LDW < 0) { coreblas_error(15, "Illegal value of LDW"); return -15; } /* Quick return */ if ((M == 0) || (N == 0) || (K == 0)) return PLASMA_SUCCESS; /* * TRANS is set as: * * ------------------------------------- * side direct PlasmaW PlasmaA2 * ------------------------------------- * left colwise T N * rowwise N T * right colwise N T * rowwise T N * ------------------------------------- */ /* Columnwise*/ if (storev == PlasmaColumnwise) { uplo = CblasUpper; if (side == PlasmaLeft) { trans = op == PlasmaA2 ? PlasmaNoTrans : PlasmaTrans; vi2 = trans == PlasmaNoTrans ? M - L : K - L; } else { trans = op == PlasmaW ? PlasmaNoTrans : PlasmaTrans; vi2 = trans == PlasmaNoTrans ? K - L : N - L; } vi3 = LDV * L; } /* Rowwise */ else { uplo = CblasLower; if (side == PlasmaLeft) { trans = op == PlasmaW ? PlasmaNoTrans : PlasmaTrans; vi2 = trans == PlasmaNoTrans ? K - L : M - L; } else { trans = op == PlasmaA2 ? PlasmaNoTrans : PlasmaTrans; vi2 = trans == PlasmaNoTrans ? N - L : K - L; } vi2 *= LDV; vi3 = L; } /**/ if (op==PlasmaW) { info = CORE_dpamm_w( side, trans, uplo, M, N, K, L, vi2, vi3, A1, LDA1, A2, LDA2, V, LDV, W, LDW); if (info != 0) return info; } else if (op==PlasmaA2) { info = CORE_dpamm_a2( side, trans, uplo, M, N, K, L, vi2, vi3, A2, LDA2, V, LDV, W, LDW); if (info != 0) return info; } return PLASMA_SUCCESS; }
int CORE_dgelqt(int M, int N, int IB, double *A, int LDA, double *T, int LDT, double *TAU, double *WORK) { int i, k, sb; int iinfo; /* Check input arguments */ if (M < 0) { coreblas_error(1, "Illegal value of M"); return -1; } if (N < 0) { coreblas_error(2, "Illegal value of N"); return -2; } if ((IB < 0) || ( (IB == 0) && ((M > 0) && (N > 0)) )) { coreblas_error(3, "Illegal value of IB"); return -3; } if ((LDA < max(1,M)) && (M > 0)) { coreblas_error(5, "Illegal value of LDA"); return -5; } if ((LDT < max(1,IB)) && (IB > 0)) { coreblas_error(7, "Illegal value of LDT"); return -7; } /* Quick return */ if ((M == 0) || (N == 0) || (IB == 0)) return PLASMA_SUCCESS; k = min(M, N); for(i = 0; i < k; i += IB) { sb = min(IB, k-i); iinfo = LAPACKE_dgelq2_work(LAPACK_COL_MAJOR, sb, N-i, &A[LDA*i+i], LDA, &TAU[i], WORK); LAPACKE_dlarft_work(LAPACK_COL_MAJOR, lapack_const(PlasmaForward), lapack_const(PlasmaRowwise), N-i, sb, &A[LDA*i+i], LDA, &TAU[i], &T[LDT*i], LDT); if (M > i+sb) { LAPACKE_dlarfb_work( LAPACK_COL_MAJOR, lapack_const(PlasmaRight), lapack_const(PlasmaNoTrans), lapack_const(PlasmaForward), lapack_const(PlasmaRowwise), M-i-sb, N-i, sb, &A[LDA*i+i], LDA, &T[LDT*i], LDT, &A[LDA*i+(i+sb)], LDA, WORK, M-i-sb); } } return PLASMA_SUCCESS; }
int CORE_dtstrf(int M, int N, int IB, int NB, double *U, int LDU, double *A, int LDA, double *L, int LDL, int *IPIV, double *WORK, int LDWORK, int *INFO) { static double zzero = 0.0; static double mzone =-1.0; double alpha; int i, j, ii, sb; int im, ip; /* Check input arguments */ *INFO = 0; if (M < 0) { coreblas_error(1, "Illegal value of M"); return -1; } if (N < 0) { coreblas_error(2, "Illegal value of N"); return -2; } if (IB < 0) { coreblas_error(3, "Illegal value of IB"); return -3; } if ((LDU < max(1,NB)) && (NB > 0)) { coreblas_error(6, "Illegal value of LDU"); return -6; } if ((LDA < max(1,M)) && (M > 0)) { coreblas_error(8, "Illegal value of LDA"); return -8; } if ((LDL < max(1,IB)) && (IB > 0)) { coreblas_error(10, "Illegal value of LDL"); return -10; } /* Quick return */ if ((M == 0) || (N == 0) || (IB == 0)) return PLASMA_SUCCESS; /* Set L to 0 */ memset(L, 0, LDL*N*sizeof(double)); ip = 0; for (ii = 0; ii < N; ii += IB) { sb = min(N-ii, IB); for (i = 0; i < sb; i++) { im = cblas_idamax(M, &A[LDA*(ii+i)], 1); IPIV[ip] = ii+i+1; if (fabs(A[LDA*(ii+i)+im]) > fabs(U[LDU*(ii+i)+ii+i])) { /* * Swap behind. */ cblas_dswap(i, &L[LDL*ii+i], LDL, &WORK[im], LDWORK ); /* * Swap ahead. */ cblas_dswap(sb-i, &U[LDU*(ii+i)+ii+i], LDU, &A[LDA*(ii+i)+im], LDA ); /* * Set IPIV. */ IPIV[ip] = NB + im + 1; for (j = 0; j < i; j++) { A[LDA*(ii+j)+im] = zzero; } } if ((*INFO == 0) && (fabs(A[LDA*(ii+i)+im]) == zzero) && (fabs(U[LDU*(ii+i)+ii+i]) == zzero)) { *INFO = ii+i+1; } alpha = ((double)1. / U[LDU*(ii+i)+ii+i]); cblas_dscal(M, (alpha), &A[LDA*(ii+i)], 1); cblas_dcopy(M, &A[LDA*(ii+i)], 1, &WORK[LDWORK*i], 1); cblas_dger( CblasColMajor, M, sb-i-1, (mzone), &A[LDA*(ii+i)], 1, &U[LDU*(ii+i+1)+ii+i], LDU, &A[LDA*(ii+i+1)], LDA ); ip = ip+1; } /* * Apply the subpanel to the rest of the panel. */ if(ii+i < N) { for(j = ii; j < ii+sb; j++) { if (IPIV[j] <= NB) { IPIV[j] = IPIV[j] - ii; } } CORE_dssssm( NB, N-(ii+sb), M, N-(ii+sb), sb, sb, &U[LDU*(ii+sb)+ii], LDU, &A[LDA*(ii+sb)], LDA, &L[LDL*ii], LDL, WORK, LDWORK, &IPIV[ii]); for(j = ii; j < ii+sb; j++) { if (IPIV[j] <= NB) { IPIV[j] = IPIV[j] + ii; } } } } return PLASMA_SUCCESS; }
int CORE_dtstrf_cublas(int M, int N, int IB, int NB, double *U, int LDU, double *A, int LDA, double *L, int LDL, int *IPIV, double *WORK, int LDWORK, int *INFO) { static double zzero = 0.0; static double mzone =-1.0; cublasStatus_t status; cudaError_t err; double alpha; int i, j, ii, sb; int im, ip; #if CONFIG_VERBOSE fprintf(stdout, "%s: M=%d N=%d IB=%d NB=%d U=%p LDU=%d A=%p LDA=%d L=%p LDL=%d IPIV=%p WORK=%p LDWORK=%d\n", __FUNCTION__, M, N, IB, NB, U, LDU, A, LDA, L, LDL, IPIV, WORK, LDWORK); fflush(stdout); #endif /* Check input arguments */ *INFO = 0; if (M < 0) { coreblas_error(1, "Illegal value of M"); return -1; } if (N < 0) { coreblas_error(2, "Illegal value of N"); return -2; } if (IB < 0) { coreblas_error(3, "Illegal value of IB"); return -3; } if ((LDU < max(1,NB)) && (NB > 0)) { coreblas_error(6, "Illegal value of LDU"); return -6; } if ((LDA < max(1,M)) && (M > 0)) { coreblas_error(8, "Illegal value of LDA"); return -8; } if ((LDL < max(1,IB)) && (IB > 0)) { coreblas_error(10, "Illegal value of LDL"); return -10; } /* Quick return */ if ((M == 0) || (N == 0) || (IB == 0)) return PLASMA_SUCCESS; /* Set L to 0 */ err = cudaMemset(L, 0, LDL*N*sizeof(double)); PLASMA_CUDA_ASSERT(err); double* dev_ptr = 0; err = cudaMalloc((void**)&dev_ptr, 2*sizeof(double)); PLASMA_CUDA_ASSERT(err); double* host_ptr; err = cudaMallocHost((void**)&host_ptr, 2*sizeof(double)); PLASMA_CUDA_ASSERT(err); int* piv = kaapi_memory_get_host_pointer_and_validate(IPIV); ip = 0; for (ii = 0; ii < N; ii += IB) { sb = min(N-ii, IB); for (i = 0; i < sb; i++) { status = cublasIdamax(kaapi_cuda_cublas_handle(), M, &A[LDA*(ii+i)], 1, &im ); PLASMA_CUBLAS_ASSERT(status); /* get im */ err = cudaStreamSynchronize(kaapi_cuda_kernel_stream()); PLASMA_CUDA_ASSERT(err); /* ajust index, CUBLAS is 1-based indexing */ im--; piv[ip] = ii+i+1; core_dtstrf_cmp(kaapi_cuda_kernel_stream(), &A[LDA*(ii+i)+im], &U[LDU*(ii+i)+ii+i], dev_ptr, host_ptr); err = cudaStreamSynchronize(kaapi_cuda_kernel_stream()); PLASMA_CUDA_ASSERT(err); if (host_ptr[0] == 1.0f) { /* * Swap behind. */ status = cublasDswap(kaapi_cuda_cublas_handle(), i, &L[LDL*ii+i], LDL, &WORK[im], LDWORK ); PLASMA_CUBLAS_ASSERT(status); /* * Swap ahead. */ status = cublasDswap(kaapi_cuda_cublas_handle(), sb-i, &U[LDU*(ii+i)+ii+i], LDU, &A[LDA*(ii+i)+im], LDA ); PLASMA_CUBLAS_ASSERT(status); /* * Set IPIV. */ piv[ip] = NB + im + 1; core_dtstrf_set_zero(kaapi_cuda_kernel_stream(), A, LDA, i, ii, im, zzero ); } core_dtstrf_cmp_zzero_and_get_alpha(kaapi_cuda_kernel_stream(), &A[LDA*(ii+i)+im], &U[LDU*(ii+i)+ii+i], zzero, dev_ptr, host_ptr); err = cudaStreamSynchronize(kaapi_cuda_kernel_stream()); PLASMA_CUDA_ASSERT(err); if ((*INFO == 0) && (host_ptr[0] == 1.0f)) { *INFO = ii+i+1; } // alpha = ((double)1. / U[LDU*(ii+i)+ii+i]); alpha = host_ptr[1]; status = cublasDscal(kaapi_cuda_cublas_handle(), M, &alpha, &A[LDA*(ii+i)], 1 ); PLASMA_CUBLAS_ASSERT(status); status = cublasDcopy(kaapi_cuda_cublas_handle(), M, &A[LDA*(ii+i)], 1, &WORK[LDWORK*i], 1 ); PLASMA_CUBLAS_ASSERT(status); status = cublasDger(kaapi_cuda_cublas_handle(), M, sb-i-1, &mzone, &A[LDA*(ii+i)], 1, &U[LDU*(ii+i+1)+ii+i], LDU, &A[LDA*(ii+i+1)], LDA ); PLASMA_CUBLAS_ASSERT(status); ip = ip+1; } /* * Apply the subpanel to the rest of the panel. */ if(ii+i < N) { for(j = ii; j < ii+sb; j++) { if (piv[j] <= NB) { piv[j] = piv[j] - ii; } } CORE_dssssm_cublas_v2( NB, N-(ii+sb), M, N-(ii+sb), sb, sb, &U[LDU*(ii+sb)+ii], LDU, &A[LDA*(ii+sb)], LDA, &L[LDL*ii], LDL, WORK, LDWORK, &piv[ii] ); err = cudaStreamSynchronize(kaapi_cuda_kernel_stream()); PLASMA_CUDA_ASSERT(err); for(j = ii; j < ii+sb; j++) { if (piv[j] <= NB) { piv[j] = piv[j] + ii; } } } } cudaFreeHost(host_ptr); cudaFree(dev_ptr); return PLASMA_SUCCESS; }
int CORE_zgblrx(int uplo, int N, PLASMA_desc *A, PLASMA_Complex64_t *V, PLASMA_Complex64_t *TAU, int st, int ed, int eltsize) { int NB, J1, J2; int len1, len2, t1ed, t2st; int i; PLASMA_desc vA=*A; /* Check input arguments */ if (N < 0) { coreblas_error(2, "Illegal value of N"); return -2; } if (ed <= st) { coreblas_error(6, "Illegal value of st and ed (internal)"); return -6; } /* Quick return */ if (N == 0) return PLASMA_SUCCESS; NB = A->mb; if( uplo == PlasmaLower ){ /* ======================== * LOWER CASE * ========================*/ for (i = ed; i >= st+1 ; i--){ /* apply reflector from the left (horizontal row) and from the right for only the diagonal 2x2.*/ J1 = st; J2 = i-2; t1ed = (J2/NB)*NB; t2st = max(t1ed+1,J1); len1 = t1ed-J1+1; len2 = J2-t2st+1; if(len1>0)CORE_zlarfx2(PlasmaLeft, len1 , *V(i), conj(*TAU(i)), A(i-1, J1 ), ELTLDD(vA, (i-1)), A(i, J1 ), ELTLDD(vA, i) ); if(len2>0)CORE_zlarfx2(PlasmaLeft, len2 , *V(i), conj(*TAU(i)), A(i-1, t2st), ELTLDD(vA, (i-1)), A(i, t2st), ELTLDD(vA, i) ); CORE_zlarfx2ce(PlasmaLower, V(i), TAU(i), A(i-1,i-1), A(i,i-1), A(i,i)); } /* APPLY RIGHT ON THE REMAINING ELEMENT OF KERNEL 1 */ for (i = ed; i >= st+1 ; i--){ J1 = i+1; J2 = min(ed,N); t1ed = (J2/NB)*NB; t2st = max(t1ed+1,J1); len1 = t1ed-J1+1; len2 = J2-t2st+1; if(len1>0)CORE_zlarfx2(PlasmaRight, len1, conj(*V(i)), conj(*TAU(i)), A(J1,i-1), ELTLDD(vA, J1) , A(J1 , i), ELTLDD(vA, J1) ); if(len2>0)CORE_zlarfx2(PlasmaRight, len2, conj(*V(i)), conj(*TAU(i)), A(t2st,i-1), ELTLDD(vA, t2st), A(t2st, i), ELTLDD(vA, t2st) ); } } else { /* ======================== * UPPER CASE * ========================*/ for (i = ed; i >= st+1 ; i--){ /* apply reflector from the left (horizontal row) and from the right for only the diagonal 2x2.*/ J1 = st; J2 = i-2; t1ed = (J2/NB)*NB; t2st = max(t1ed+1,J1); len1 = t1ed-J1+1; len2 = J2-t2st+1; if(len1>0)CORE_zlarfx2(PlasmaRight, len1, conj(*V(i)), conj(*TAU(i)), A(J1,i-1), ELTLDD(vA, J1) , A(J1 , i), ELTLDD(vA, J1) ); if(len2>0)CORE_zlarfx2(PlasmaRight, len2, conj(*V(i)), conj(*TAU(i)), A(t2st,i-1), ELTLDD(vA, t2st), A(t2st, i), ELTLDD(vA, t2st) ); CORE_zlarfx2ce(PlasmaUpper, V(i), TAU(i), A(i-1,i-1), A(i-1, i), A(i,i)); } /* APPLY LEFT ON THE REMAINING ELEMENT OF KERNEL 1 */ for (i = ed; i >= st+1 ; i--){ J1 = i+1; J2 = min(ed,N); t1ed = (J2/NB)*NB; t2st = max(t1ed+1,J1); len1 = t1ed-J1+1; len2 = J2-t2st+1; if(len1>0)CORE_zlarfx2(PlasmaLeft, len1 , *V(i), conj(*TAU(i)), A(i-1, J1 ), ELTLDD(vA, (i-1)), A(i, J1 ), ELTLDD(vA, i) ); if(len2>0)CORE_zlarfx2(PlasmaLeft, len2 , *V(i), conj(*TAU(i)), A(i-1, t2st), ELTLDD(vA, (i-1)), A(i, t2st), ELTLDD(vA, i) ); } } /* end of else for the upper case */ return PLASMA_SUCCESS; }
int CORE_sgessm(int M, int N, int K, int IB, int *IPIV, float *L, int LDL, float *A, int LDA) { static float zone = 1.0; static float mzone = -1.0; static int ione = 1; int i, sb; int tmp, tmp2; /* Check input arguments */ if (M < 0) { coreblas_error(1, "Illegal value of M"); return -1; } if (N < 0) { coreblas_error(2, "Illegal value of N"); return -2; } if (K < 0) { coreblas_error(3, "Illegal value of K"); return -3; } if (IB < 0) { coreblas_error(4, "Illegal value of IB"); return -4; } if ((LDL < max(1,M)) && (M > 0)) { coreblas_error(7, "Illegal value of LDL"); return -7; } if ((LDA < max(1,M)) && (M > 0)) { coreblas_error(9, "Illegal value of LDA"); return -9; } /* Quick return */ if ((M == 0) || (N == 0) || (K == 0) || (IB == 0)) return PLASMA_SUCCESS; for(i = 0; i < K; i += IB) { sb = min(IB, K-i); /* * Apply interchanges to columns I*IB+1:IB*( I+1 )+1. */ tmp = i+1; tmp2 = i+sb; LAPACKE_slaswp_work(LAPACK_COL_MAJOR, N, A, LDA, tmp, tmp2, IPIV, ione); /* * Compute block row of U. */ cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit, sb, N, (zone), &L[LDL*i+i], LDL, &A[i], LDA ); if (i+sb < M) { /* * Update trailing submatrix. */ cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, M-(i+sb), N, sb, (mzone), &L[LDL*i+(i+sb)], LDL, &A[i], LDA, (zone), &A[i+sb], LDA ); } } return PLASMA_SUCCESS; }
int CORE_ztsmlq(int side, int trans, int M1, int N1, int M2, int N2, int K, int IB, PLASMA_Complex64_t *A1, int LDA1, PLASMA_Complex64_t *A2, int LDA2, PLASMA_Complex64_t *V, int LDV, PLASMA_Complex64_t *T, int LDT, PLASMA_Complex64_t *WORK, int LDWORK) { int i, i1, i3; int NW; int kb; int ic = 0; int jc = 0; int mi = M1; int ni = N1; /* Check input arguments */ if ((side != PlasmaLeft) && (side != PlasmaRight)) { coreblas_error(1, "Illegal value of side"); return -1; } /* NW is the minimum dimension of WORK */ if (side == PlasmaLeft) { NW = IB; } else { NW = N1; } if ((trans != PlasmaNoTrans) && (trans != PlasmaConjTrans)) { coreblas_error(2, "Illegal value of trans"); return -2; } if (M1 < 0) { coreblas_error(3, "Illegal value of M1"); return -3; } if (N1 < 0) { coreblas_error(4, "Illegal value of N1"); return -4; } if ( (M2 < 0) || ( (M2 != M1) && (side == PlasmaRight) ) ){ coreblas_error(5, "Illegal value of M2"); return -5; } if ( (N2 < 0) || ( (N2 != N1) && (side == PlasmaLeft) ) ){ coreblas_error(6, "Illegal value of N2"); return -6; } if ((K < 0) || ( (side == PlasmaLeft) && (K > M1) ) || ( (side == PlasmaRight) && (K > N1) ) ) { coreblas_error(7, "Illegal value of K"); return -7; } if (IB < 0) { coreblas_error(8, "Illegal value of IB"); return -8; } if (LDA1 < max(1,M1)){ coreblas_error(10, "Illegal value of LDA1"); return -10; } if (LDA2 < max(1,M2)){ coreblas_error(12, "Illegal value of LDA2"); return -12; } if (LDV < max(1,K)){ coreblas_error(14, "Illegal value of LDV"); return -14; } if (LDT < max(1,IB)){ coreblas_error(16, "Illegal value of LDT"); return -16; } if (LDWORK < max(1,NW)){ coreblas_error(18, "Illegal value of LDWORK"); return -18; } /* Quick return */ if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) return PLASMA_SUCCESS; if (((side == PlasmaLeft) && (trans == PlasmaNoTrans)) || ((side == PlasmaRight) && (trans != PlasmaNoTrans))) { i1 = 0; i3 = IB; } else { i1 = ((K-1) / IB)*IB; i3 = -IB; } if (trans == PlasmaNoTrans) { trans = PlasmaConjTrans; } else { trans = PlasmaNoTrans; } for(i = i1; (i > -1) && (i < K); i += i3) { kb = min(IB, K-i); if (side == PlasmaLeft) { /* * H or H' is applied to C(i:m,1:n) */ mi = M1 - i; ic = i; } else { /* * H or H' is applied to C(1:m,i:n) */ ni = N1 - i; jc = i; } /* * Apply H or H' */ CORE_ztsrfb( side, trans, PlasmaForward, PlasmaRowwise, mi, ni, M2, N2, kb, &A1[LDA1*jc+ic], LDA1, A2, LDA2, &V[i], LDV, &T[LDT*i], LDT, WORK, LDWORK); } return PLASMA_SUCCESS; }
int CORE_zttqrt(int M, int N, int IB, PLASMA_Complex64_t *A1, int LDA1, PLASMA_Complex64_t *A2, int LDA2, PLASMA_Complex64_t *T, int LDT, PLASMA_Complex64_t *TAU, PLASMA_Complex64_t *WORK) { static PLASMA_Complex64_t zone = 1.0; static PLASMA_Complex64_t zzero = 0.0; static int ione = 1; PLASMA_Complex64_t alpha; int i, j, ii, sb, mi, ni; /* Check input arguments */ if (M < 0) { coreblas_error(1, "Illegal value of M"); return -1; } if (N < 0) { coreblas_error(2, "Illegal value of N"); return -2; } if (IB < 0) { coreblas_error(3, "Illegal value of IB"); return -3; } if ((LDA2 < max(1,M)) && (M > 0)) { coreblas_error(7, "Illegal value of LDA2"); return -7; } /* Quick return */ if ((M == 0) || (N == 0) || (IB == 0)) return PLASMA_SUCCESS; for(ii = 0; ii < N; ii += IB) { sb = min(N-ii, IB); for(i = 0; i < sb; i++) { /* * Generate elementary reflector H( II*IB+I ) to annihilate * A( II*IB+I:mi, II*IB+I ). */ mi = ii + i + 1; LAPACKE_zlarfg_work(mi+1, &A1[LDA1*(ii+i)+ii+i], &A2[LDA2*(ii+i)], ione, &TAU[ii+i]); if (sb-i-1>0) { /* * Apply H( II*IB+I ) to A( II*IB+I:M, II*IB+I+1:II*IB+IB ) from the left. */ ni = sb-i-1; cblas_zcopy( ni, &A1[LDA1*(ii+i+1)+(ii+i)], LDA1, WORK, 1); #ifdef COMPLEX LAPACKE_zlacgv_work(ni, WORK, ione); #endif cblas_zgemv( CblasColMajor, (CBLAS_TRANSPOSE)PlasmaConjTrans, mi, ni, CBLAS_SADDR(zone), &A2[LDA2*(ii+i+1)], LDA2, &A2[LDA2*(ii+i)], 1, CBLAS_SADDR(zone), WORK, 1); #ifdef COMPLEX LAPACKE_zlacgv_work(ni, WORK, ione); #endif alpha = -conj(TAU[ii+i]); cblas_zaxpy( ni, CBLAS_SADDR(alpha), WORK, 1, &A1[LDA1*(ii+i+1)+ii+i], LDA1); #ifdef COMPLEX LAPACKE_zlacgv_work(ni, WORK, ione); #endif cblas_zgerc( CblasColMajor, mi, ni, CBLAS_SADDR(alpha), &A2[LDA2*(ii+i)], 1, WORK, 1, &A2[LDA2*(ii+i+1)], LDA2); } /* * Calculate T. */ if (i > 0 ) { cblas_zcopy(i, &A2[LDA2*(ii+i)+ii], 1, &WORK[ii], 1); cblas_ztrmv( CblasColMajor, (CBLAS_UPLO)PlasmaUpper, (CBLAS_TRANSPOSE)PlasmaConjTrans, (CBLAS_DIAG)PlasmaNonUnit, i, &A2[LDA2*ii+ii], LDA2, &WORK[ii], 1); alpha = -(TAU[ii+i]); for(j = 0; j < i; j++) { WORK[ii+j] = alpha * WORK[ii+j]; } if (ii > 0) { cblas_zgemv( CblasColMajor, (CBLAS_TRANSPOSE)PlasmaConjTrans, ii, i, CBLAS_SADDR(alpha), &A2[LDA2*ii], LDA2, &A2[LDA2*(ii+i)], 1, CBLAS_SADDR(zzero), WORK, 1); cblas_zaxpy(i, CBLAS_SADDR(zone), &WORK[ii], 1, WORK, 1); } cblas_zcopy(i, WORK, 1, &T[LDT*(ii+i)], 1); cblas_ztrmv( CblasColMajor, (CBLAS_UPLO)PlasmaUpper, (CBLAS_TRANSPOSE)PlasmaNoTrans, (CBLAS_DIAG)PlasmaNonUnit, i, &T[LDT*ii], LDT, &T[LDT*(ii+i)], 1); } T[LDT*(ii+i)+i] = TAU[ii+i]; } /* Apply Q' to the rest of the matrix to the left */ if (N > ii+sb) { CORE_zttrfb( PlasmaLeft, PlasmaConjTrans, PlasmaForward, PlasmaColumnwise, sb, N-(ii+sb), ii+sb, N-(ii+sb), sb, &A1[LDA1*(ii+sb)+ii], LDA1, &A2[LDA2*(ii+sb)], LDA2, &A2[LDA2*ii], LDA2, &T[LDT*ii], LDT, WORK, sb); } } return PLASMA_SUCCESS; }
int CORE_zpltmg_chebvand( int M, int N, PLASMA_Complex64_t *A, int LDA, int gN, int m0, int n0, PLASMA_Complex64_t *W ) { PLASMA_Complex64_t step; int i, j, jj; /* Check input arguments */ if (M < 0) { coreblas_error(1, "Illegal value of M"); return -1; } if (N < 0) { coreblas_error(2, "Illegal value of N"); return -2; } if ((LDA < max(1,M)) && (M > 0)) { coreblas_error(4, "Illegal value of LDA"); return -4; } if (m0 < 0) { coreblas_error(6, "Illegal value of m0"); return -6; } if (n0 < 0) { coreblas_error(7, "Illegal value of n0"); return -7; } if (gN < n0+N) { coreblas_error(5, "Illegal value of gN"); return -5; } step = (PLASMA_Complex64_t)1. / (gN - 1.); /* Initialize W if required */ if (m0 == 0) { for (j=0, jj=n0; j<N; j++, jj++) { W[2*j ] = 1.; W[2*j+1] = jj * step; } if ( M == 1 ) { LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, 'A', 1, N, W, 2, A, LDA ); return PLASMA_SUCCESS; } LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, 'A', 2, N, W, 2, A, LDA ); M -= 2; A += 2; } /* Case NB=1, W contains row 0 and 1 and M should be 1 */ if (m0 == 1) { if (M != 1) { coreblas_error(1, "Illegal value of M for m0 = 1"); return -1; } LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, 'A', 1, N, W+1, 2, A, LDA ); return PLASMA_SUCCESS; } for (j=0, jj=n0; j<N; j++, jj++) { /* First line */ if (M > 0) { A[LDA*j] = 2. * jj * step * W[j*2 + 1] - W[j*2 ]; } /* Second line */ if (M > 1) { A[LDA*j+1] = 2. * jj * step * A[LDA*j ] - W[j*2+1]; } for (i=2; i<M; i++) { A[LDA*j+i] = 2. * jj * step * A[LDA*j+i-1] - A[LDA*j+i-2]; } } if ( M == 1 ) { cblas_zcopy( N, W+1, 2, W, 2 ); cblas_zcopy( N, A+M-1, LDA, W+1, 2 ); } else { LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, 'A', 2, N, A + M - 2, LDA, W, 2 ); } return PLASMA_SUCCESS; }
int CORE_zttlqt(int M, int N, int IB, PLASMA_Complex64_t *A1, int LDA1, PLASMA_Complex64_t *A2, int LDA2, PLASMA_Complex64_t *T, int LDT, PLASMA_Complex64_t *TAU, PLASMA_Complex64_t *WORK) { static PLASMA_Complex64_t zone = 1.0; static PLASMA_Complex64_t zzero = 0.0; #ifdef COMPLEX static int ione = 1; #endif PLASMA_Complex64_t alpha; int i, j, l, ii, sb, mi, ni; /* Check input arguments */ if (M < 0) { coreblas_error(1, "Illegal value of M"); return -1; } if (N < 0) { coreblas_error(2, "Illegal value of N"); return -2; } if (IB < 0) { coreblas_error(3, "Illegal value of IB"); return -3; } if ((LDA2 < max(1,M)) && (M > 0)) { coreblas_error(7, "Illegal value of LDA2"); return -7; } /* Quick return */ if ((M == 0) || (N == 0) || (IB == 0)) return PLASMA_SUCCESS; /* TODO: Need to check why some cases require * this to not have uninitialized values */ CORE_zlaset( PlasmaUpperLower, IB, N, 0., 0., T, LDT); for(ii = 0; ii < M; ii += IB) { sb = min(M-ii, IB); for(i = 0; i < sb; i++) { j = ii + i; mi = sb-i-1; ni = min( j + 1, N); /* * Generate elementary reflector H( II*IB+I ) to annihilate A( II*IB+I, II*IB+I:M ). */ #ifdef COMPLEX LAPACKE_zlacgv_work(ni, &A2[j], LDA2); LAPACKE_zlacgv_work(ione, &A1[LDA1*j+j], LDA1); #endif LAPACKE_zlarfg_work(ni+1, &A1[LDA1*j+j], &A2[j], LDA2, &TAU[j]); if (mi > 0) { /* * Apply H( j-1 ) to A( j:II+IB-1, j-1:M ) from the right. */ cblas_zcopy( mi, &A1[LDA1*j+(j+1)], 1, WORK, 1); cblas_zgemv( CblasColMajor, (CBLAS_TRANSPOSE)PlasmaNoTrans, mi, ni, CBLAS_SADDR(zone), &A2[j+1], LDA2, &A2[j], LDA2, CBLAS_SADDR(zone), WORK, 1); alpha = -(TAU[j]); cblas_zaxpy( mi, CBLAS_SADDR(alpha), WORK, 1, &A1[LDA1*j+j+1], 1); cblas_zgerc( CblasColMajor, mi, ni, CBLAS_SADDR(alpha), WORK, 1, &A2[j], LDA2, &A2[j+1], LDA2); } /* * Calculate T. */ if (i > 0 ) { l = min(i, max(0, N-ii)); alpha = -(TAU[j]); CORE_zpemv( PlasmaNoTrans, PlasmaRowwise, i , min(j, N), l, alpha, &A2[ii], LDA2, &A2[j], LDA2, zzero, &T[LDT*j], 1, WORK); /* T(0:i-1, j) = T(0:i-1, ii:j-1) * T(0:i-1, j) */ cblas_ztrmv( CblasColMajor, (CBLAS_UPLO)PlasmaUpper, (CBLAS_TRANSPOSE)PlasmaNoTrans, (CBLAS_DIAG)PlasmaNonUnit, i, &T[LDT*ii], LDT, &T[LDT*j], 1); } #ifdef COMPLEX LAPACKE_zlacgv_work(ni, &A2[j], LDA2 ); LAPACKE_zlacgv_work(ione, &A1[LDA1*j+j], LDA1 ); #endif T[LDT*j+i] = TAU[j]; } /* Apply Q to the rest of the matrix to the right */ if (M > ii+sb) { mi = M-(ii+sb); ni = min(ii+sb, N); l = min(sb, max(0, ni-ii)); CORE_zparfb( PlasmaRight, PlasmaNoTrans, PlasmaForward, PlasmaRowwise, mi, IB, mi, ni, sb, l, &A1[LDA1*ii+ii+sb], LDA1, &A2[ii+sb], LDA2, &A2[ii], LDA2, &T[LDT*ii], LDT, WORK, M); } } return PLASMA_SUCCESS; }
int CORE_zlatro(PLASMA_enum uplo, PLASMA_enum trans, int M, int N, const PLASMA_Complex64_t *A, int LDA, PLASMA_Complex64_t *B, int LDB) { int i, j; /* Check input arguments */ if ((uplo != PlasmaUpper) && (uplo != PlasmaLower) && (uplo != PlasmaUpperLower) ) { coreblas_error(1, "Illegal value of uplo"); return -1; } if ((trans != PlasmaConjTrans) && (trans != PlasmaNoTrans) && (trans != PlasmaTrans) ) { coreblas_error(2, "Illegal value of trans"); return -2; } if (M < 0) { coreblas_error(3, "Illegal value of M"); return -3; } if (N < 0) { coreblas_error(4, "Illegal value of N"); return -4; } if ( (LDA < max(1,M)) && (M > 0) ) { coreblas_error(6, "Illegal value of LDA"); return -6; } if ( (LDB < max(1,N)) && (N > 0) ) { coreblas_error(8, "Illegal value of LDB"); return -8; } if (trans == PlasmaNoTrans) { CORE_zlacpy(uplo, M, N, A, LDA, B, LDB); } else { if (trans == PlasmaConjTrans) { if(uplo == PlasmaUpper) { for(j=0; j<N; j++) for(i=0; i<min(j+1,M); i++) B[j+i*LDB] = conj(A[i+j*LDA]); } else if(uplo == PlasmaLower) { for(j=0;j<N;j++) for(i=j;i<M;i++) B[j+i*LDB] = conj(A[i+j*LDA]); } else { for(j=0;j<N;j++) for(i=0;i<M;i++) B[j+i*LDB] = conj(A[i+j*LDA]); } } else { if(uplo==PlasmaUpper) { for(j=0;j<N;j++) for(i=0;i<min(j+1,M);i++) B[j+i*LDB] = A[i+j*LDA]; } else if(uplo==PlasmaLower) { for(j=0;j<N;j++) for(i=j;i<M;i++) B[j+i*LDB] = A[i+j*LDA]; } else { for(j=0;j<N;j++) for(i=0;i<M;i++) B[j+i*LDB] = A[i+j*LDA]; } } } return PLASMA_SUCCESS; }
int CORE_ctsqrt(int M, int N, int IB, PLASMA_Complex32_t *A1, int LDA1, PLASMA_Complex32_t *A2, int LDA2, PLASMA_Complex32_t *T, int LDT, PLASMA_Complex32_t *TAU, PLASMA_Complex32_t *WORK) { static PLASMA_Complex32_t zone = 1.0; static PLASMA_Complex32_t zzero = 0.0; PLASMA_Complex32_t alpha; int i, ii, sb; /* Check input arguments */ if (M < 0) { coreblas_error(1, "Illegal value of M"); return -1; } if (N < 0) { coreblas_error(2, "Illegal value of N"); return -2; } if (IB < 0) { coreblas_error(3, "Illegal value of IB"); return -3; } if ((LDA2 < max(1,M)) && (M > 0)) { coreblas_error(8, "Illegal value of LDA2"); return -8; } /* Quick return */ if ((M == 0) || (N == 0) || (IB == 0)) return PLASMA_SUCCESS; for(ii = 0; ii < N; ii += IB) { sb = min(N-ii, IB); for(i = 0; i < sb; i++) { /* * Generate elementary reflector H( II*IB+I ) to annihilate * A( II*IB+I:M, II*IB+I ) */ LAPACKE_clarfg_work(M+1, &A1[LDA1*(ii+i)+ii+i], &A2[LDA2*(ii+i)], 1, &TAU[ii+i]); if (ii+i+1 < N) { /* * Apply H( II*IB+I ) to A( II*IB+I:M, II*IB+I+1:II*IB+IB ) from the left */ alpha = -conjf(TAU[ii+i]); cblas_ccopy( sb-i-1, &A1[LDA1*(ii+i+1)+(ii+i)], LDA1, WORK, 1); #ifdef COMPLEX LAPACKE_clacgv_work(sb-i-1, WORK, 1); #endif cblas_cgemv( CblasColMajor, (CBLAS_TRANSPOSE)PlasmaConjTrans, M, sb-i-1, CBLAS_SADDR(zone), &A2[LDA2*(ii+i+1)], LDA2, &A2[LDA2*(ii+i)], 1, CBLAS_SADDR(zone), WORK, 1); #ifdef COMPLEX LAPACKE_clacgv_work(sb-i-1, WORK, 1 ); #endif cblas_caxpy( sb-i-1, CBLAS_SADDR(alpha), WORK, 1, &A1[LDA1*(ii+i+1)+ii+i], LDA1); #ifdef COMPLEX LAPACKE_clacgv_work(sb-i-1, WORK, 1 ); #endif cblas_cgerc( CblasColMajor, M, sb-i-1, CBLAS_SADDR(alpha), &A2[LDA2*(ii+i)], 1, WORK, 1, &A2[LDA2*(ii+i+1)], LDA2); } /* * Calculate T */ alpha = -TAU[ii+i]; cblas_cgemv( CblasColMajor, (CBLAS_TRANSPOSE)PlasmaConjTrans, M, i, CBLAS_SADDR(alpha), &A2[LDA2*ii], LDA2, &A2[LDA2*(ii+i)], 1, CBLAS_SADDR(zzero), &T[LDT*(ii+i)], 1); cblas_ctrmv( CblasColMajor, (CBLAS_UPLO)PlasmaUpper, (CBLAS_TRANSPOSE)PlasmaNoTrans, (CBLAS_DIAG)PlasmaNonUnit, i, &T[LDT*ii], LDT, &T[LDT*(ii+i)], 1); T[LDT*(ii+i)+i] = TAU[ii+i]; } if (N > ii+sb) { CORE_ctsmqr( PlasmaLeft, PlasmaConjTrans, sb, N-(ii+sb), M, N-(ii+sb), IB, IB, &A1[LDA1*(ii+sb)+ii], LDA1, &A2[LDA2*(ii+sb)], LDA2, &A2[LDA2*ii], LDA2, &T[LDT*ii], LDT, WORK, sb); } } return PLASMA_SUCCESS; }
/***************************************************************************//** * * @ingroup CORE_PLASMA_Complex64_t * * CORE_zgetf2_nopiv computes an LU factorization of a general diagonal * dominant M-by-N matrix A witout no pivoting and no blocking. It is the * internal function called by CORE_zgetrf_nopiv(). * * The factorization has the form * A = L * U * where L is lower triangular with unit * diagonal elements (lower trapezoidal if m > n), and U is upper * triangular (upper trapezoidal if m < n). * * This is the right-looking Level 3 BLAS version of the algorithm. * ******************************************************************************* * * @param[in] M * The number of rows of the matrix A. M >= 0. * * @param[in] N * The number of columns of the matrix A. N >= 0. * * @param[in,out] A * On entry, the M-by-N matrix to be factored. * On exit, the factors L and U from the factorization * A = P*L*U; the unit diagonal elements of L are not stored. * * @param[in] LDA * The leading dimension of the array A. LDA >= max(1,M). * ******************************************************************************* * * @return * \retval PLASMA_SUCCESS successful exit * \retval <0 if INFO = -k, the k-th argument had an illegal value * \retval >0 if INFO = k, U(k,k) is exactly zero. The factorization * has been completed, but the factor U is exactly * singular, and division by zero will occur if it is used * to solve a system of equations. * ******************************************************************************/ int CORE_zgetf2_nopiv(int M, int N, PLASMA_Complex64_t *A, int LDA) { PLASMA_Complex64_t mzone = (PLASMA_Complex64_t)-1.0; PLASMA_Complex64_t alpha; double sfmin; int i, j, k; int info; /* Check input arguments */ info = 0; if (M < 0) { coreblas_error(1, "Illegal value of M"); return -1; } if (N < 0) { coreblas_error(2, "Illegal value of N"); return -2; } if ((LDA < max(1,M)) && (M > 0)) { coreblas_error(5, "Illegal value of LDA"); return -5; } /* Quick return */ if ( (M == 0) || (N == 0) ) return PLASMA_SUCCESS; sfmin = LAPACKE_dlamch_work('S'); k = min(M, N); for(i=0 ; i < k; i++) { alpha = A[i*LDA+i]; if ( alpha != (PLASMA_Complex64_t)0.0 ) { /* Compute elements J+1:M of J-th column. */ if (i < M) { if ( cabs(alpha) > sfmin ) { alpha = 1.0 / alpha; cblas_zscal( M-i-1, CBLAS_SADDR(alpha), &(A[i*LDA+i+1]), 1); } else { for(j=i+1; j<M; j++) A[LDA*i+j] = A[LDA*i+j] / alpha; } } } else if ( info == 0 ) { info = i; goto end; } if ( i < k ) { /* Update trailing submatrix */ cblas_zgeru(CblasColMajor, M-i-1, N-i-1, CBLAS_SADDR(mzone), &A[LDA* i +i+1], 1, &A[LDA*(i+1)+i ], LDA, &A[LDA*(i+1)+i+1], LDA); } } end: return info; }