void CORE_dtstrf_quark(Quark *quark) { int m; int n; int ib; int nb; double *U; int ldu; double *A; int lda; double *L; int ldl; int *IPIV; double *WORK; int ldwork; PLASMA_sequence *sequence; PLASMA_request *request; PLASMA_bool check_info; int iinfo; int info; quark_unpack_args_17(quark, m, n, ib, nb, U, ldu, A, lda, L, ldl, IPIV, WORK, ldwork, sequence, request, check_info, iinfo); CORE_dtstrf(m, n, ib, nb, U, ldu, A, lda, L, ldl, IPIV, WORK, ldwork, &info); if (info != PLASMA_SUCCESS && check_info) plasma_sequence_flush(quark, sequence, request, iinfo + info); }
void CORE_cgetrf_reclap_quark(Quark* quark) { int M; int N; PLASMA_Complex32_t *A; int LDA; int *IPIV; PLASMA_sequence *sequence; PLASMA_request *request; PLASMA_bool check_info; int iinfo; int info[3]; int maxthreads; quark_unpack_args_10(quark, M, N, A, LDA, IPIV, sequence, request, check_info, iinfo, maxthreads ); info[1] = QUARK_Get_RankInTask(quark); info[2] = maxthreads; CORE_cgetrf_reclap( M, N, A, LDA, IPIV, info ); if (info[1] == 0 && info[0] != PLASMA_SUCCESS && check_info) plasma_sequence_flush(quark, sequence, request, iinfo + info[0] ); }
void CORE_zlag2c_quark(Quark *quark) { int m; int n; PLASMA_Complex64_t *A; int lda; PLASMA_Complex32_t *B; int ldb; PLASMA_sequence *sequence; PLASMA_request *request; int info; quark_unpack_args_8(quark, m, n, A, lda, B, ldb, sequence, request); info = LAPACKE_zlag2c_work(LAPACK_COL_MAJOR, m, n, A, lda, B, ldb); if (sequence->status == PLASMA_SUCCESS && info != 0) plasma_sequence_flush(quark, sequence, request, info); }
void CORE_dgetrf_quark(Quark *quark) { int m; int n; double *A; int lda; int *IPIV; PLASMA_sequence *sequence; PLASMA_request *request; PLASMA_bool check_info; int iinfo; int info; quark_unpack_args_9(quark, m, n, A, lda, IPIV, sequence, request, check_info, iinfo); info = LAPACKE_dgetrf_work(LAPACK_COL_MAJOR, m, n, A, lda, IPIV ); if (info != PLASMA_SUCCESS && check_info) plasma_sequence_flush(quark, sequence, request, iinfo+info); }
void CORE_zgetrf_incpiv_quark(Quark *quark) { int m; int n; int ib; PLASMA_Complex64_t *A; int lda; int *IPIV; PLASMA_sequence *sequence; PLASMA_request *request; PLASMA_bool check_info; int iinfo; int info; quark_unpack_args_10(quark, m, n, ib, A, lda, IPIV, sequence, request, check_info, iinfo); CORE_zgetrf_incpiv(m, n, ib, A, lda, IPIV, &info); if (info != PLASMA_SUCCESS && check_info) plasma_sequence_flush(quark, sequence, request, iinfo+info); }
void CORE_ctrtri_quark(Quark *quark) { int uplo; int diag; int N; PLASMA_Complex32_t *A; int LDA; PLASMA_sequence *sequence; PLASMA_request *request; int iinfo; int info; quark_unpack_args_8(quark, uplo, diag, N, A, LDA, sequence, request, iinfo); info = LAPACKE_ctrtri_work( LAPACK_COL_MAJOR, lapack_const(uplo), lapack_const(diag), N, A, LDA); if ((sequence->status == PLASMA_SUCCESS) && (info != 0)) plasma_sequence_flush(quark, sequence, request, iinfo + info); }
void CORE_zhegst_quark(Quark *quark) { int itype; PLASMA_enum uplo; int n; PLASMA_Complex64_t *A; int lda; PLASMA_Complex64_t *B; int ldb; PLASMA_sequence *sequence; PLASMA_request *request; int iinfo; int info; quark_unpack_args_10(quark, itype, uplo, n, A, lda, B, ldb, sequence, request, iinfo); info = LAPACKE_zhegst_work( LAPACK_COL_MAJOR, itype, lapack_const(uplo), n, A, lda, B, ldb); if (sequence->status == PLASMA_SUCCESS && info != 0) plasma_sequence_flush(quark, sequence, request, iinfo+info); }
/***************************************************************************//** * Parallel Reduction from BAND tridiagonal to the final condensed form - dynamic scheduler **/ void plasma_pzhbrdt_quark(PLASMA_enum uplo, PLASMA_desc A, double *D, double *E, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; #ifdef COMPLEX static PLASMA_Complex64_t zone = (PLASMA_Complex64_t) 1.0; static double dzero = (double) 0.0; PLASMA_Complex64_t ztmp; double absztmp; #endif PLASMA_Complex64_t *C, *S; int blksweep, lcsweep, blkid, lcNB; int N, NB, NT, grsiz, lcgrsiz; int i; size_t eltsize = plasma_element_size(A.dtyp); plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); NT = A.nt; N = A.m; NB = A.mb; /* Quick return */ if (N == 0){ return; } if (NB == 0) { memset(D, 0, N*sizeof(double)); memset(E, 0, (N-1)*sizeof(double)); #ifdef COMPLEX for (i=0; i<N; i++) D[i] = cabs(*A(i,i)); #else for (i=0; i<N; i++) D[i] = *A(i,i); #endif return; } /* * Barrier is used because the bulge have to wait until * the reduction to band has been finish. * otherwise, I can remove this BARRIER when I integrate * the function dependencies link inside the reduction to * band. Keep in min the case when NB=1, where no bulge-chasing. */ /***************************************************************/ QUARK_Barrier(plasma->quark); tblg = -Wtimming(); /***************************************************************/ /* * Case NB=1 ==> matrix is already Bidiagonal. no need to bulge. * Make diagonal and superdiagonal elements real, storing them in * D and E. if PlasmaLower, first transform lower bidiagonal form * to upper bidiagonal by applying plane rotations/ Householder * from the left, overwriting superdiagonal elements then make * elements real of the resulting upper Bidiagonal. if PlasmaUpper * then make its elements real. For Q, PT: ZSCAL should be done * in case of WANTQ. */ if (NB == 1){ memset(D, 0, N *sizeof(double)); memset(E, 0, (N-1)*sizeof(double)); #ifdef COMPLEX if(uplo==PlasmaLower){ for (i=0; i<N; i++) { D[i] = creal( *A(i, i) ); /* diag value */ if( i < (N-1)) { /* lower off-diag value */ ztmp = *A((i+1),i); absztmp = cabs(ztmp); *A((i+1),i) = absztmp; E[i] = absztmp; if(absztmp != dzero) ztmp = (PLASMA_Complex64_t) (ztmp / absztmp); else ztmp = zone; if(i<(N-2)) *A((i+2),(i+1)) = *A((i+2),(i+1)) * ztmp; /* for Q: ZSCAL should be done in case of WANTQ */ } } } else { /* PlasmaUpper */ for (i=0; i<N; i++) { D[i] = creal( *A(i,i) ); /* diag value*/ if(i<(N-1)) { /* lower off-diag value */ ztmp = *A(i, (i+1)); absztmp = cabs(ztmp); *A(i,(i+1)) = absztmp; E[i] = absztmp; if(absztmp != dzero) ztmp = (PLASMA_Complex64_t) (ztmp / absztmp); else ztmp = zone; if(i<(N-2)) *A((i+1),(i+2)) = *A((i+1),(i+2)) * ztmp; /* for Q: ZSCAL should be done in case of WANTQ. HERE NEED THE multiply by CONJ(T) */ } } } /* end PlasmaUpper*/ #else if( uplo == PlasmaLower ){ for (i=0; i < N-1; i++) { D[i] = *A(i, i); E[i] = *A(i+1, i); } D[i] = *A(i, i); } else { for (i=0; i < N-1; i++) { D[i] = *A(i, i ); E[i] = *A(i, i+1); } D[i] = *A(i, i); } #endif return; } /* Case N<NB ==> matrix is very small and better to call lapack XHETRD. */ if( N <= 0 ) /* this will be removed we don t need it. */ { PLASMA_Complex64_t *work, *TTau; int info, ldwork = N*N; work = (PLASMA_Complex64_t *) plasma_shared_alloc(plasma, ldwork, PlasmaComplexDouble); TTau = (PLASMA_Complex64_t *) plasma_shared_alloc(plasma, N, PlasmaComplexDouble); info = LAPACKE_zhetrd_work(LAPACK_COL_MAJOR, lapack_const(uplo), N, A(0,0), A.lm, D, E, TTau, work, ldwork); plasma_shared_free(plasma, (void*) work); plasma_shared_free(plasma, (void*) TTau); if( info == 0 ) sequence->status = PLASMA_SUCCESS; else plasma_sequence_flush(plasma->quark, sequence, request, info); return; } /* General case NB > 1 && N > NB */ C = (PLASMA_Complex64_t *) plasma_shared_alloc(plasma, N, PlasmaComplexDouble); S = (PLASMA_Complex64_t *) plasma_shared_alloc(plasma, N, PlasmaComplexDouble); /*************************************************************************** * START BULGE CHASING CODE **************************************************************************/ /* * Initialisation of local parameter. those parameter should be * input or tuned parameter. */ grsiz = 1; if( NB > 160 ) { grsiz = 1; } else if( NB > 100 ) { grsiz = 1; /* if( N < 5000 ) grsiz = 1; else grsiz = 2; */ } else { grsiz = 2; } grsiz = max(1, grsiz); /*grsiz=1;*/ /*printf(" Version -dp- N %5d NB %5d lcNB %5d grsiz %5d A.ln %5d A.nb %5d \n",N,NB,lcNB,grsiz,A.ln,A.nb);*/ for (blksweep = 0; blksweep<NT; blksweep++){ lcNB = blksweep == NT-1 ? A.n-blksweep*A.nb : A.nb; /*printf(" Version -dp- N %5d NB %5d lcNB %5d grsiz %5d blksweep%5d NT %5d \n",N,NB,lcNB,grsiz,blksweep,NT);*/ for (lcsweep = 0; lcsweep<lcNB; lcsweep++){ for (blkid = blksweep; blkid<NT; blkid=blkid+grsiz){ lcgrsiz = (blkid+1) < NT ? grsiz : NT-blkid; /*printf(" Version -dp- N %5d NB %5d lcNB %5d grsiz %5d lcgrsiz %5d blkid %5d \n",N,NB,lcNB,grsiz,lcgrsiz,blkid);*/ QUARK_CORE_ztrdalg_v2( plasma->quark, &task_flags, uplo, &A, C, S, lcgrsiz, lcsweep, blkid, blksweep); } } } /* * Barrier used only for now, to be sure that everything * is done before copying the D and E and free workspace. * this will be removed later when D and E are directly filled * during the bulge process. */ QUARK_Barrier(plasma->quark); tblg += Wtimming(); printf(" done with bulge %lf \n\n\n",tblg); plasma_shared_free(plasma, (void*) C); plasma_shared_free(plasma, (void*) S); /* * STORE THE RESULTING diagonal/off-diagonal in D AND E */ memset(D, 0, N *sizeof(double)); memset(E, 0, (N-1)*sizeof(double)); /* Make diagonal and superdiagonal elements real, * storing them in D and E */ /* In complex case, the off diagonal element are * not necessary real. we have to make off-diagonal * elements real and copy them to E. * When using HouseHolder elimination, * the ZLARFG give us a real as output so, all the * diagonal/off-diagonal element except the last one are already * real and thus we need only to take the abs of the last * one. * */ #ifdef COMPLEX if(uplo==PlasmaLower){ for (i=0; i < N-1 ; i++) { D[i] = creal( *A(i,i) ); /* * Alternative for Householder case, all off-diag * are real except the last off-diag, where we * have to take the abs */ if(i<(N-2)) E[i] = creal(*A(i+1, i)); else E[i] = cabs( *A(i+1, i)); } D[i] = creal( *A(i, i) ); } else { /* PlasmaUpper */ for (i=0; i<N-1; i++) { D[i] = creal( *A(i,i) ); /* * Alternative for Householder case, all off-diag * are real except the last off-diag, where we * have to take the abs */ if( i < (N-2) ) E[i] = creal(*A(i, (i+1))); else E[i] = cabs(*A(i, (i+1))); } D[i] = creal( *A(i, i) ); } /* end PlasmaUpper */ #else if( uplo == PlasmaLower ){ for (i=0; i < N-1; i++) { D[i] = *A(i, i); E[i] = *A(i+1, i); } D[i] = *A(i, i); } else { for (i=0; i < N-1; i++) { D[i] = *A(i, i ); E[i] = *A(i, i+1); } D[i] = *A(i, i); } #endif } /* END FUNCTION */
/***************************************************************************//** * Parallel Reduction from BAND tridiagonal to the final condensed form - dynamic scheduler **/ void plasma_pdsbrdt_quark(PLASMA_enum uplo, PLASMA_desc A, double *D, double *E, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; #ifdef COMPLEX static double zone = (double) 1.0; static double dzero = (double) 0.0; double ztmp; double absztmp; #endif double *C, *S; int N, NB, INgrsiz, INthgrsiz, BAND; int myid, grsiz, shift=3, stt, st, ed, stind, edind; int blklastind, colpt, PCOL, ACOL, MCOL; int stepercol, mylastid, grnb, grid; int *DEP,*MAXID; int i, j, m; int thgrsiz, thgrnb, thgrid, thed; size_t eltsize = plasma_element_size(A.dtyp); plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); N = A.m; NB = A.mb; /* Quick return */ if (N == 0){ return; } if (NB == 0) { memset(D, 0, N*sizeof(double)); memset(E, 0, (N-1)*sizeof(double)); #ifdef COMPLEX for (i=0; i<N; i++) D[i] = fabs(*A(i,i)); #else for (i=0; i<N; i++) D[i] = *A(i,i); #endif return; } /* * Barrier is used because the bulge have to wait until * the reduction to band has been finish. * otherwise, I can remove this BARRIER when I integrate * the function dependencies link inside the reduction to * band. Keep in min the case when NB=1, where no bulge-chasing. */ /***************************************************************/ QUARK_Barrier(plasma->quark); tblg = -Wtimming(); /***************************************************************/ /* * Case NB=1 ==> matrix is already Bidiagonal. no need to bulge. * Make diagonal and superdiagonal elements real, storing them in * D and E. if PlasmaLower, first transform lower bidiagonal form * to upper bidiagonal by applying plane rotations/ Householder * from the left, overwriting superdiagonal elements then make * elements real of the resulting upper Bidiagonal. if PlasmaUpper * then make its elements real. For Q, PT: ZSCAL should be done * in case of WANTQ. */ if (NB == 1){ memset(D, 0, N *sizeof(double)); memset(E, 0, (N-1)*sizeof(double)); #ifdef COMPLEX if(uplo==PlasmaLower){ for (i=0; i<N; i++) { D[i] = ( *A(i, i) ); /* diag value */ if( i < (N-1)) { /* lower off-diag value */ ztmp = *A((i+1),i); absztmp = fabs(ztmp); *A((i+1),i) = absztmp; E[i] = absztmp; if(absztmp != dzero) ztmp = (double) (ztmp / absztmp); else ztmp = zone; if(i<(N-2)) *A((i+2),(i+1)) = *A((i+2),(i+1)) * ztmp; /* for Q: ZSCAL should be done in case of WANTQ */ } } } else { /* PlasmaUpper */ for (i=0; i<N; i++) { D[i] = ( *A(i,i) ); /* diag value*/ if(i<(N-1)) { /* lower off-diag value */ ztmp = *A(i, (i+1)); absztmp = fabs(ztmp); *A(i,(i+1)) = absztmp; E[i] = absztmp; if(absztmp != dzero) ztmp = (double) (ztmp / absztmp); else ztmp = zone; if(i<(N-2)) *A((i+1),(i+2)) = *A((i+1),(i+2)) * ztmp; /* for Q: ZSCAL should be done in case of WANTQ. HERE NEED THE multiply by CONJ(T) */ } } } /* end PlasmaUpper*/ #else if( uplo == PlasmaLower ){ for (i=0; i < N-1; i++) { D[i] = *A(i, i); E[i] = *A(i+1, i); } D[i] = *A(i, i); } else { for (i=0; i < N-1; i++) { D[i] = *A(i, i ); E[i] = *A(i, i+1); } D[i] = *A(i, i); } #endif return; } /* Case N<NB ==> matrix is very small and better to call lapack XHETRD. */ if( N <= 0 ) /* this will be removed we don t need it. */ { double *work, *TTau; int info, ldwork = N*N; work = (double *) plasma_shared_alloc(plasma, ldwork, PlasmaRealDouble); TTau = (double *) plasma_shared_alloc(plasma, N, PlasmaRealDouble); info = LAPACKE_dsytrd_work(LAPACK_COL_MAJOR, lapack_const(uplo), N, A(0,0), A.lm, D, E, TTau, work, ldwork); plasma_shared_free(plasma, (void*) work); plasma_shared_free(plasma, (void*) TTau); if( info == 0 ) sequence->status = PLASMA_SUCCESS; else plasma_sequence_flush(plasma->quark, sequence, request, info); return; } /* General case NB > 1 && N > NB */ DEP = (int *) plasma_shared_alloc(plasma, N+1, PlasmaInteger ); MAXID = (int *) plasma_shared_alloc(plasma, N+1, PlasmaInteger ); C = (double *) plasma_shared_alloc(plasma, N, PlasmaRealDouble); S = (double *) plasma_shared_alloc(plasma, N, PlasmaRealDouble); memset(MAXID,0,(N+1)*sizeof(int)); /*************************************************************************** * START BULGE CHASING CODE **************************************************************************/ /* * Initialisation of local parameter. those parameter should be * input or tuned parameter. */ INgrsiz = 1; if( NB > 160 ) { INgrsiz = 2; } else if( NB > 100 ) { if( N < 5000 ) INgrsiz = 2; else INgrsiz = 4; } else { INgrsiz = 6; } INthgrsiz = N; BAND = 0; grsiz = INgrsiz; thgrsiz = INthgrsiz; if( grsiz == 0 ) grsiz = 6; if( thgrsiz == 0 ) thgrsiz = N; i = shift/grsiz; stepercol = i*grsiz == shift ? i:i+1; i = (N-2)/thgrsiz; thgrnb = i*thgrsiz == (N-2) ? i:i+1; for (thgrid = 1; thgrid<=thgrnb; thgrid++){ stt = (thgrid-1)*thgrsiz+1; thed = min( (stt + thgrsiz -1), (N-2)); for (i = stt; i <= N-2; i++){ ed=min(i,thed); if(stt>ed)break; for (m = 1; m <=stepercol; m++){ st=stt; for (j = st; j <=ed; j++){ /* PCOL: dependency on the ID of the master of the group of the previous column. (Previous Column:PCOL). */ /* ACOL: dependency on the ID of the master of the previous group of my column. (Acctual Column:ACOL). (it is 0(NULL) for myid=1) */ /* MCOL: OUTPUT dependency on the my ID, to be used by the next ID. (My Column: MCOL). I am the master of this group. */ myid = (i-j)*(stepercol*grsiz) +(m-1)*grsiz + 1; mylastid = myid+grsiz-1; PCOL = mylastid+shift-1; /* to know the dependent ID of the previous column. need to know the master of its group */ MAXID[j] = myid; PCOL = min(PCOL,MAXID[j-1]); /* for the last columns, we might do only 1 or 2 kernel, so the PCOL will be wrong. this is to force it to the last ID of the previous col.*/ grnb = PCOL/grsiz; grid = grnb*grsiz == PCOL ? grnb:grnb+1; PCOL = (grid-1)*grsiz +1; /* give me the ID of the master of the group of the previous column. */ ACOL = myid-grsiz; if(myid==1)ACOL=0; MCOL = myid; QUARK_CORE_dtrdalg( plasma->quark, &task_flags, uplo, N, NB, &A, C, S, i, j, m, grsiz, BAND, DEP(PCOL), DEP(ACOL), DEP(MCOL) ); if(mylastid%2 ==0){ blklastind = (mylastid/2)*NB+1+j-1; }else{ colpt = ((mylastid+1)/2)*NB + 1 +j -1 ; stind = colpt-NB+1; edind = min(colpt,N); if( (stind>=edind-1) && (edind==N) ) blklastind=N; else blklastind=0; } if(blklastind >= (N-1)) stt=stt+1; } /* END for j=st:ed */ } /* END for m=1:stepercol */ } /* END for i=1:MINMN-2 */ } /* END for thgrid=1:thgrnb */ /* * Barrier used only for now, to be sure that everything * is done before copying the D and E and free workspace. * this will be removed later when D and E are directly filled * during the bulge process. */ QUARK_Barrier(plasma->quark); tblg += Wtimming(); //printf(" done with bulge %lf \n\n\n",tblg); plasma_shared_free(plasma, (void*) DEP); plasma_shared_free(plasma, (void*) MAXID); plasma_shared_free(plasma, (void*) C); plasma_shared_free(plasma, (void*) S); /* * STORE THE RESULTING diagonal/off-diagonal in D AND E */ memset(D, 0, N *sizeof(double)); memset(E, 0, (N-1)*sizeof(double)); /* Make diagonal and superdiagonal elements real, * storing them in D and E */ /* In complex case, the off diagonal element are * not necessary real. we have to make off-diagonal * elements real and copy them to E. * When using HouseHolder elimination, * the ZLARFG give us a real as output so, all the * diagonal/off-diagonal element except the last one are already * real and thus we need only to take the abs of the last * one. * */ #ifdef COMPLEX if(uplo==PlasmaLower){ for (i=0; i < N-1 ; i++) { D[i] = ( *A(i,i) ); /* * Alternative for Householder case, all off-diag * are real except the last off-diag, where we * have to take the abs */ if(i<(N-2)) E[i] = (*A(i+1, i)); else E[i] = fabs( *A(i+1, i)); } D[i] = ( *A(i, i) ); } else { /* PlasmaUpper */ for (i=0; i<N-1; i++) { D[i] = ( *A(i,i) ); /* * Alternative for Householder case, all off-diag * are real except the last off-diag, where we * have to take the abs */ if( i < (N-2) ) E[i] = (*A(i, (i+1))); else E[i] = fabs(*A(i, (i+1))); } D[i] = ( *A(i, i) ); } /* end PlasmaUpper */ #else if( uplo == PlasmaLower ){ for (i=0; i < N-1; i++) { D[i] = *A(i, i); E[i] = *A(i+1, i); } D[i] = *A(i, i); } else { for (i=0; i < N-1; i++) { D[i] = *A(i, i ); E[i] = *A(i, i+1); } D[i] = *A(i, i); } #endif } /* END FUNCTION */