/**
 * Swap rows of a single-tile-wide descriptor, then solve against a unit
 * lower-triangular block.
 *
 * The routine first forwards (descA, i1, i2, ipiv, inc) to
 * CORE_dlaswp_ontile, then calls cblas_dtrsm (column-major, left side,
 * lower, no transpose, unit diagonal, alpha = 1.0) with Akk as the
 * triangular operand and the tile A(0, 0) as the right-hand side that is
 * overwritten.
 *
 * @param descA  Descriptor of the tiles to update; must not span more than
 *               one tile column (descA.nt <= 1).
 * @param i1     First row index of the interchange range; must be >= 1.
 * @param i2     Last row index of the interchange range; must satisfy
 *               i1 <= i2 <= (descA.mt == 1 ? descA.m : descA.mb).
 * @param ipiv   Pivot indices, passed through to CORE_dlaswp_ontile.
 * @param inc    Pivot increment, passed through to CORE_dlaswp_ontile.
 * @param Akk    Triangular operand handed to cblas_dtrsm.
 * @param ldak   Leading dimension of Akk.
 *
 * @return PLASMA_SUCCESS on completion; -1, -2 or -3 (after reporting via
 *         coreblas_error) when descA.nt, i1 or i2 respectively is rejected.
 */
int CORE_dswptr_ontile(PLASMA_desc descA, int i1, int i2, const int *ipiv, int inc,
                       const double *Akk, int ldak)
{
    const double one = 1.0;
    int tile_rows;
    int ld_tile;

    /* Row count of the first tile: the whole height when there is only one
     * tile row, otherwise one full tile. */
    if (descA.mt == 1) {
        tile_rows = descA.m;
    } else {
        tile_rows = descA.mb;
    }

    /* Argument checks, reported with the 1-based position used below. */
    if (descA.nt > 1) {
        coreblas_error(1, "Illegal value of descA.nt");
        return -1;
    }

    if (i1 < 1) {
        coreblas_error(2, "Illegal value of i1");
        return -2;
    }

    if (i2 < i1 || i2 > tile_rows) {
        coreblas_error(3, "Illegal value of i2");
        return -3;
    }

    /* Row interchanges first ... */
    CORE_dlaswp_ontile(descA, i1, i2, ipiv, inc);

    /* ... then the in-place triangular solve on the first tile. */
    ld_tile = BLKLDD(descA, 0);
    cblas_dtrsm(CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit,
                tile_rows, descA.n, one,
                Akk, ldak,
                A(0, 0), ld_tile);

    return PLASMA_SUCCESS;
}
/***************************************************************************//**
 *  Parallel tile LU factorization - dynamic scheduling - Right looking
 *
 *  The routine makes two passes over the diagonal steps
 *  k = 0 .. min(A.mt, A.nt) - 1:
 *
 *   1. Factorization pass. For each k the column panel starting at tile
 *      (k, k) is handed to CORE_dgetrf_rectil together with IPIV(k). Then,
 *      for every tile column n > k, CORE_dswptr_ontile is applied to the
 *      column and the tiles below row k are updated with cblas_dgemm
 *      (C := C - A * B).
 *   2. Back-swap pass. For each k and every tile column n < k,
 *      CORE_dlaswp_ontile applies the interchanges stored in IPIV(k) to
 *      the sub-matrix that starts at row k * A.mb of column n.
 *
 *  Each kernel call is preceded by an hclib_pragma_marker whose second
 *  argument is the text of an OpenMP "task depend(...)" directive. The
 *  strings refer to the local variables declared just before them BY NAME,
 *  so those locals must not be renamed or reordered without updating the
 *  strings.
 *  NOTE(review): the marker presumably binds to the statement that
 *  immediately follows it - confirm against the hclib tooling.
 *
 *  @param A     Tile descriptor of the matrix to factorize (passed by value).
 *  @param IPIV  Pivot storage, addressed per tile row through IPIV(k). It is
 *               given to CORE_dgetrf_rectil under a depend(out) annotation
 *               and read by the row-interchange kernels.
 *
 *  The function returns nothing; return values of the kernels it calls are
 *  not inspected.
 **/
void plasma_pdgetrf_rectil_quark(PLASMA_desc A, int *IPIV)
{
    int k, m, n;
    int tempk, tempm, tempkm, tempkn, tempmm, tempnn;
    int ldak, ldam;

    double zone = (double)1.0;
    double mzone = (double)-1.0;

    /* Integer tag stored in a pointer. In this function it is never
     * dereferenced; it is only named in the depend clauses below. */
    void * fakedep;

    /* How many threads per panel? Probably needs to be adjusted during factorization. */
    CORE_dgetrf_rectil_init();

    /* ---- Pass 1: panel factorization and trailing update ---- */
    for (k = 0; k < min(A.mt, A.nt); k++)
    {
        tempk = k * A.mb;                               /* first row covered by tile row k   */
        tempm = A.m - tempk;                            /* rows from tile row k to the bottom */
        tempkm = k == A.mt-1 ? tempm : A.mb;            /* height of tile (k, k)             */
        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;       /* width of tile (k, k)              */
        ldak = BLKLDD(A, k);

        /* Names below are referenced by the marker string.
         * Note that dB is an int pointer (pivots) in this scope. */
        double *dA = A(k, k);
        int *dB = IPIV(k);
        /* Sub-descriptor for the panel: tempm rows by tempkn columns
         * starting at (tempk, k*A.nb). */
        PLASMA_desc pDesc = plasma_desc_submatrix(A, tempk, k*A.nb, tempm, tempkn);
        hclib_pragma_marker("omp", "task depend(inout:dA[0:A.mb*A.nb]) depend(out:dB[0:pDesc.n])", "pragma59_omp_task");
        {
            /* info[0] is left unset before the call.
             * NOTE(review): info[1] / info[2] are presumably the thread rank
             * and thread count used by the panel kernel - confirm against
             * CORE_dgetrf_rectil. */
            int info[3];
            info[1] = 0;
            info[2] = 1;

            CORE_dgetrf_rectil(
                    pDesc,
                    dB,
                    info );
        }

        /*
         * Update the trailing submatrix
         */
        fakedep = (void *)(intptr_t)(k+1);
        for (n = k+1; n < A.nt; n++)
        {
            /*
             * Apply row interchange after the panel (work on the panel)
             */
            tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;   /* width of tile column n */
            PLASMA_desc descA = plasma_desc_submatrix(A, tempk, n*A.nb, tempm, tempnn);
            /* These declarations shadow the outer dA / dB; here dB is the
             * diagonal tile (double) rather than the pivot array. */
            double *dA = A(k, n);
            double *dB = A(k, k);
            int *dipiv = IPIV(k);
            hclib_pragma_marker("omp", "task depend(inout:dA[0:1]) depend(in:dB[0:ldak], dipiv[0:tempkm])", "pragma82_omp_task");
            /* Swap rows 1..tempkm of column n, then solve against tile (k, k). */
            CORE_dswptr_ontile(descA, 1, tempkm, dipiv, 1, dB, ldak);

            /* First tile row below the diagonal is handled separately from
             * the remaining rows: its task carries no fake1/fake2 dependence. */
            m = k+1;
            if ( m < A.mt ) {
                tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;   /* height of tile row m */
                ldam = BLKLDD(A, m);

                double *dA = A(m , k);
                double *dB = A(k , n);
                double *dC = A(m , n);
                hclib_pragma_marker("omp", "task depend(in:dA[0:A.mb*A.mb], dB[0:A.mb*A.mb]) depend(inout:dC[0:A.mb*A.mb])", "pragma93_omp_task");
                /* dC := dC - dA * dB (alpha = mzone, beta = zone).
                 * NOTE(review): the inner dimension is always A.nb, also for
                 * a partial last tile, and the depend ranges use A.mb*A.mb
                 * whereas the panel task uses A.mb*A.nb - confirm intended. */
                cblas_dgemm(CblasColMajor, (CBLAS_TRANSPOSE)PlasmaNoTrans, (CBLAS_TRANSPOSE)PlasmaNoTrans,
                        tempmm, tempnn, A.nb,
                        mzone, dA, ldam,
                        dB, ldak,
                        zone, dC, ldam);

                /* Remaining tile rows of column n. */
                for (m = k+2; m < A.mt; m++)
                {
                    tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
                    ldam = BLKLDD(A, m);

                    double *dA = A(m , k);
                    double *dB = A(k , n);
                    double *dC = A(m , n);
                    /* fake1 / fake2 are only named in the depend clause:
                     * fake1 is tile (k+1, n), fake2 is the integer tag k+1
                     * reinterpreted as a pointer. Neither is passed to dgemm. */
                    double *fake1 = A(k+1, n);
                    double *fake2 = (double *)fakedep;
                    hclib_pragma_marker("omp", "task depend(in:dA[0:A.mb*A.mb], dB[0:A.mb*A.mb], fake2[0:1]) depend(inout:dC[0:A.mb*A.mb], fake1[0:A.mb*A.nb])", "pragma110_omp_task");
                    /* Same update as above: dC := dC - dA * dB. */
                    cblas_dgemm(CblasColMajor, (CBLAS_TRANSPOSE)PlasmaNoTrans, (CBLAS_TRANSPOSE)PlasmaNoTrans,
                            tempmm, tempnn, A.nb,
                            mzone, dA, ldam,
                            dB, ldak,
                            zone, dC, ldam);
                }
            }
        }
    }

    /* ---- Pass 2: apply each step's interchanges to the columns on its left ---- */
    for (k = 0; k < min(A.mt, A.nt); k++)
    {
        int mintmp;
        tempk = k * A.mb;
        tempm = A.m - tempk;
        tempkm = k == A.mt-1 ? tempm : A.mb;
        tempkn = k == A.nt-1 ? A.n - k * A.nb : A.nb;
        /* Number of interchanges to apply: the smaller side of tile (k, k). */
        mintmp = min(tempkm, tempkn);
        ldak = BLKLDD(A, k);

        /*
         * Apply row interchange behind the panel (work on the panel)
         */
        fakedep = (void*)(intptr_t)k;
        for (n = 0; n < k; n++)
        {
            tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
            /* Aij and prevSwap are only named in the depend clause.
             * n < k implies k >= 1 here, so the tile row index k-1 is
             * non-negative. */
            double *Aij = A(k, n);
            double *prevSwap = A(k-1, n);
            int *dipiv = IPIV(k);
            PLASMA_desc descA = plasma_desc_submatrix(A, tempk, n*A.nb, tempm, tempnn);
            hclib_pragma_marker("omp", "task depend(inout:Aij[0:1],fakedep) depend(in:dipiv[0:mintmp], prevSwap[0:A.lm*A.nb])", "pragma142_omp_task");
            CORE_dlaswp_ontile(descA, 1, mintmp, dipiv, 1);
        }
    }
}