/***************************************************************************//**
 *  Parallel Reduction from BAND tridiagonal to the final condensed form -
 *  dynamic scheduler.
 *
 *  Bulge-chases the band matrix held in tile descriptor A (order N = A.m,
 *  bandwidth NB = A.mb) down to condensed form, then copies the diagonal
 *  into D and the off-diagonal into E.  All kernels are submitted as tasks
 *  to the QUARK dynamic runtime.
 *
 *  @param[in]     uplo      PlasmaLower or PlasmaUpper: which triangle of A
 *                           holds the band.
 *  @param[in,out] A         Band matrix descriptor; its entries are
 *                           overwritten during the chase.
 *  @param[out]    D         Array of size N receiving the diagonal.
 *  @param[out]    E         Array of size N-1 receiving the off-diagonal.
 *  @param[in]     T         Not referenced in this routine.
 *  @param[in,out] sequence  Task sequence (completion checks / error handling).
 *  @param[in,out] request   Request handle used when flushing the sequence.
 **/
void plasma_pzhbrdt_quark(PLASMA_enum uplo,
                          PLASMA_desc A, double *D, double *E, PLASMA_desc T,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;
#ifdef COMPLEX
    static PLASMA_Complex64_t zone  = (PLASMA_Complex64_t) 1.0;
    static double             dzero = (double) 0.0;
    PLASMA_Complex64_t ztmp;
    double absztmp;
#endif
    PLASMA_Complex64_t *C, *S;   /* workspaces for the rotation coefficients */
    int blksweep, lcsweep, blkid, lcNB;
    int N, NB, NT, grsiz, lcgrsiz;
    int i;
    size_t eltsize = plasma_element_size(A.dtyp); /* NOTE(review): unused in this routine */

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    NT = A.nt;   /* number of tile columns */
    N  = A.m;    /* matrix order */
    NB = A.mb;   /* tile size = band width */

    /* Quick return */
    if (N == 0){
        return;
    }

    /* Degenerate band of width 0: the matrix is already diagonal. */
    if (NB == 0) {
        memset(D, 0,  N   *sizeof(double));
        memset(E, 0, (N-1)*sizeof(double));
#ifdef COMPLEX
        for (i=0; i<N; i++)
            D[i] = cabs(*A(i,i));
#else
        for (i=0; i<N; i++)
            D[i] = *A(i,i);
#endif
        return;
    }

    /*
     * Barrier is used because the bulge chasing has to wait until the
     * reduction to band has finished.  It could be removed once the task
     * dependencies are linked into the reduction-to-band stage.  Keep in
     * mind the case NB=1, where no bulge-chasing is needed.
     */
    /***************************************************************/
    QUARK_Barrier(plasma->quark);
    /* NOTE(review): tblg and Wtimming() are not declared in this file --
     * presumably debug-timing globals; confirm they exist at link time. */
    tblg = -Wtimming();
    /***************************************************************/

    /*
     * Case NB=1 ==> matrix is already bidiagonal; no need to bulge-chase.
     * Make diagonal and superdiagonal elements real, storing them in D and E.
     * If PlasmaLower, first transform the lower bidiagonal form to upper
     * bidiagonal by applying plane rotations / Householder from the left,
     * overwriting superdiagonal elements, then make the elements of the
     * resulting upper bidiagonal real.  If PlasmaUpper, just make its
     * elements real.  For Q, PT: ZSCAL should be done in case of WANTQ.
     */
    if (NB == 1){
        memset(D, 0,  N   *sizeof(double));
        memset(E, 0, (N-1)*sizeof(double));
#ifdef COMPLEX
        if(uplo==PlasmaLower){
            for (i=0; i<N; i++)
            {
                D[i] = creal( *A(i, i) );        /* diag value */
                if( i < (N-1)) {                 /* lower off-diag value */
                    ztmp = *A((i+1),i);
                    absztmp = cabs(ztmp);
                    *A((i+1),i) = absztmp;       /* store the modulus ... */
                    E[i] = absztmp;
                    /* ... and push the phase into the next off-diagonal so
                     * the transformation stays equivalent. */
                    if(absztmp != dzero)
                        ztmp = (PLASMA_Complex64_t) (ztmp / absztmp);
                    else
                        ztmp = zone;
                    if(i<(N-2))
                        *A((i+2),(i+1)) = *A((i+2),(i+1)) * ztmp;
                    /* for Q: ZSCAL should be done in case of WANTQ */
                }
            }
        } else { /* PlasmaUpper */
            for (i=0; i<N; i++)
            {
                D[i] = creal( *A(i,i) );         /* diag value */
                if(i<(N-1)) {                    /* upper off-diag value */
                    ztmp = *A(i, (i+1));
                    absztmp = cabs(ztmp);
                    *A(i,(i+1)) = absztmp;
                    E[i] = absztmp;
                    if(absztmp != dzero)
                        ztmp = (PLASMA_Complex64_t) (ztmp / absztmp);
                    else
                        ztmp = zone;
                    if(i<(N-2))
                        *A((i+1),(i+2)) = *A((i+1),(i+2)) * ztmp;
                    /* for Q: ZSCAL should be done in case of WANTQ.
                     * HERE NEED THE multiply by CONJ(T) */
                }
            }
        } /* end PlasmaUpper */
#else
        /* Real case: entries are already real, just copy them out. */
        if( uplo == PlasmaLower ){
            for (i=0; i < N-1; i++)
            {
                D[i] = *A(i, i);
                E[i] = *A(i+1, i);
            }
            D[i] = *A(i, i);
        } else {
            for (i=0; i < N-1; i++)
            {
                D[i] = *A(i, i );
                E[i] = *A(i, i+1);
            }
            D[i] = *A(i, i);
        }
#endif
        return;
    }

    /* Case N<NB ==> matrix is very small; better to call LAPACK xHETRD.
     * NOTE(review): the guard below tests N <= 0, which is unreachable after
     * the N == 0 quick return above -- it looks like it was meant to be
     * N <= NB.  The original comment says this branch will be removed. */
    if( N <= 0 ) /* this will be removed; we don't need it. */
    {
        PLASMA_Complex64_t *work, *TTau;
        int info, ldwork = N*N;
        work = (PLASMA_Complex64_t *) plasma_shared_alloc(plasma, ldwork, PlasmaComplexDouble);
        TTau = (PLASMA_Complex64_t *) plasma_shared_alloc(plasma, N,      PlasmaComplexDouble);
        info = LAPACKE_zhetrd_work(LAPACK_COL_MAJOR, lapack_const(uplo), N,
                                   A(0,0), A.lm, D, E, TTau, work, ldwork);
        plasma_shared_free(plasma, (void*) work);
        plasma_shared_free(plasma, (void*) TTau);

        if( info == 0 )
            sequence->status = PLASMA_SUCCESS;
        else
            plasma_sequence_flush(plasma->quark, sequence, request, info);
        return;
    }

    /* General case NB > 1 && N > NB */
    C = (PLASMA_Complex64_t *) plasma_shared_alloc(plasma, N, PlasmaComplexDouble);
    S = (PLASMA_Complex64_t *) plasma_shared_alloc(plasma, N, PlasmaComplexDouble);

    /***************************************************************************
     *                       START BULGE CHASING CODE
     **************************************************************************/
    /*
     * Initialisation of local parameters.  These should eventually become
     * input or tuned parameters.  grsiz is the number of tile-blocks a
     * single task covers per sweep step.
     */
    grsiz = 1;
    if( NB > 160 ) {
        grsiz = 1;
    }
    else if( NB > 100 ) {
        grsiz = 1;
        /*
        if( N < 5000 )
            grsiz = 1;
        else
            grsiz = 2;
        */
    } else {
        grsiz = 2;
    }
    grsiz = max(1, grsiz);
    /*grsiz=1;*/
    /*printf(" Version -dp- N %5d NB %5d lcNB %5d grsiz %5d A.ln %5d A.nb %5d \n",N,NB,lcNB,grsiz,A.ln,A.nb);*/

    /* Sweep structure: blksweep walks tile columns, lcsweep the columns
     * inside the current tile, blkid the tile-blocks the bulge travels
     * through, grouped grsiz at a time. */
    for (blksweep = 0; blksweep<NT; blksweep++){
        /* last tile column may be narrower than A.nb */
        lcNB = blksweep == NT-1 ? A.n-blksweep*A.nb : A.nb;
        /*printf(" Version -dp- N %5d NB %5d lcNB %5d grsiz %5d blksweep%5d NT %5d \n",N,NB,lcNB,grsiz,blksweep,NT);*/
        for (lcsweep = 0; lcsweep<lcNB; lcsweep++){
            for (blkid = blksweep; blkid<NT; blkid=blkid+grsiz){
                /* clamp the group size at the end of the matrix */
                lcgrsiz = (blkid+1) < NT ? grsiz : NT-blkid;
                /*printf(" Version -dp- N %5d NB %5d lcNB %5d grsiz %5d lcgrsiz %5d blkid %5d \n",N,NB,lcNB,grsiz,lcgrsiz,blkid);*/
                QUARK_CORE_ztrdalg_v2(
                    plasma->quark, &task_flags,
                    uplo, &A, C, S,
                    lcgrsiz, lcsweep, blkid, blksweep);
            }
        }
    }

    /*
     * Barrier used only for now, to be sure everything is done before
     * copying D and E and freeing the workspace.  It will be removed later
     * when D and E are filled directly during the bulge process.
     */
    QUARK_Barrier(plasma->quark);
    tblg += Wtimming();
    /* NOTE(review): debug timing output left enabled in this precision. */
    printf(" done with bulge %lf \n\n\n",tblg);
    plasma_shared_free(plasma, (void*) C);
    plasma_shared_free(plasma, (void*) S);

    /*
     * STORE THE RESULTING diagonal/off-diagonal in D AND E
     */
    memset(D, 0,  N   *sizeof(double));
    memset(E, 0, (N-1)*sizeof(double));

    /* Make diagonal and superdiagonal elements real, storing them in D and E.
     * In the complex case the off-diagonal elements are not necessarily real;
     * they must be made real before being copied to E.  When Householder
     * elimination is used, ZLARFG returns a real value, so all elements
     * except the last off-diagonal are already real and only that last one
     * needs its absolute value taken. */
#ifdef COMPLEX
    if(uplo==PlasmaLower){
        for (i=0; i < N-1 ; i++)
        {
            D[i] = creal( *A(i,i) );
            /*
             * Householder case: all off-diagonals are real except the last
             * one, where we have to take the abs.
             */
            if(i<(N-2))
                E[i] = creal(*A(i+1, i));
            else
                E[i] = cabs( *A(i+1, i));
        }
        D[i] = creal( *A(i, i) );
    } else { /* PlasmaUpper */
        for (i=0; i<N-1; i++)
        {
            D[i] = creal( *A(i,i) );
            /*
             * Householder case: all off-diagonals are real except the last
             * one, where we have to take the abs.
             */
            if( i < (N-2) )
                E[i] = creal(*A(i, (i+1)));
            else
                E[i] = cabs(*A(i, (i+1)));
        }
        D[i] = creal( *A(i, i) );
    } /* end PlasmaUpper */
#else
    if( uplo == PlasmaLower ){
        for (i=0; i < N-1; i++)
        {
            D[i] = *A(i, i);
            E[i] = *A(i+1, i);
        }
        D[i] = *A(i, i);
    } else {
        for (i=0; i < N-1; i++)
        {
            D[i] = *A(i, i );
            E[i] = *A(i, i+1);
        }
        D[i] = *A(i, i);
    }
#endif
} /* END FUNCTION */
/***************************************************************************//** * * @ingroup PLASMA_Complex64_t_Tile_Async * * PLASMA_zlansy_Tile_Async - Non-blocking equivalent of PLASMA_zlansy_Tile(). * May return before the computation is finished. * Allows for pipelining of operations at runtime. * ******************************************************************************* * * @param[in] sequence * Identifies the sequence of function calls that this call belongs to * (for completion checks and exception handling purposes). * * @param[out] request * Identifies this function call (for exception handling purposes). * ******************************************************************************* * * @sa PLASMA_zlansy * @sa PLASMA_zlansy_Tile * @sa PLASMA_clansy_Tile_Async * @sa PLASMA_dlansy_Tile_Async * @sa PLASMA_slansy_Tile_Async * ******************************************************************************/ int PLASMA_zlansy_Tile_Async(PLASMA_enum norm, PLASMA_enum uplo, PLASMA_desc *A, double *value, PLASMA_sequence *sequence, PLASMA_request *request) { PLASMA_desc descA; double *work = NULL; plasma_context_t *plasma; plasma = plasma_context_self(); if (plasma == NULL) { plasma_fatal_error("PLASMA_zlansy_Tile", "PLASMA not initialized"); return PLASMA_ERR_NOT_INITIALIZED; } if (sequence == NULL) { plasma_fatal_error("PLASMA_zlansy_Tile", "NULL sequence"); return PLASMA_ERR_UNALLOCATED; } if (request == NULL) { plasma_fatal_error("PLASMA_zlansy_Tile", "NULL request"); return PLASMA_ERR_UNALLOCATED; } /* Check sequence status */ if (sequence->status == PLASMA_SUCCESS) request->status = PLASMA_SUCCESS; else return plasma_request_fail(sequence, request, PLASMA_ERR_SEQUENCE_FLUSHED); /* Check descriptors for correctness */ if (plasma_desc_check(A) != PLASMA_SUCCESS) { plasma_error("PLASMA_zlansy_Tile", "invalid descriptor"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } else { descA = *A; } /* Check input arguments */ if 
(descA.nb != descA.mb) { plasma_error("PLASMA_zlansy_Tile", "only square tiles supported"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } if ( (norm != PlasmaMaxNorm) && (norm != PlasmaOneNorm) && (norm != PlasmaInfNorm) && (norm != PlasmaFrobeniusNorm) ) { plasma_error("PLASMA_zlansy_Tile", "illegal value of norm"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } if ( (uplo != PlasmaUpper) && (uplo != PlasmaLower) ) { plasma_error("PLASMA_zlansy_Tile", "illegal value of uplo"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } /* Quick return */ if ( descA.m == 0) { *value = 0.0; return PLASMA_SUCCESS; } if (PLASMA_SCHEDULING == PLASMA_STATIC_SCHEDULING) { if (norm == PlasmaFrobeniusNorm) { work = plasma_shared_alloc(plasma, 2*PLASMA_SIZE, PlasmaRealDouble ); } else { work = plasma_shared_alloc(plasma, PLASMA_SIZE, PlasmaRealDouble ); } } plasma_parallel_call_7(plasma_pzlansy, PLASMA_enum, norm, PLASMA_enum, uplo, PLASMA_desc, descA, double*, work, double*, value, PLASMA_sequence*, sequence, PLASMA_request*, request); if (work != NULL) plasma_shared_free( plasma, work ); return PLASMA_SUCCESS; }
/***************************************************************************//**
 *  Parallel Reduction from BAND tridiagonal to the final condensed form -
 *  dynamic scheduler (double real precision).
 *
 *  Bulge-chases the band matrix held in tile descriptor A (order N = A.m,
 *  bandwidth NB = A.mb) down to condensed form, then copies the diagonal
 *  into D and the off-diagonal into E.  Tasks are ordered through explicit
 *  integer dependency slots in the DEP array.
 *
 *  @param[in]     uplo      PlasmaLower or PlasmaUpper: which triangle of A
 *                           holds the band.
 *  @param[in,out] A         Band matrix descriptor; overwritten during the chase.
 *  @param[out]    D         Array of size N receiving the diagonal.
 *  @param[out]    E         Array of size N-1 receiving the off-diagonal.
 *  @param[in]     T         Not referenced in this routine.
 *  @param[in,out] sequence  Task sequence (completion checks / error handling).
 *  @param[in,out] request   Request handle used when flushing the sequence.
 **/
void plasma_pdsbrdt_quark(PLASMA_enum uplo,
                          PLASMA_desc A, double *D, double *E, PLASMA_desc T,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;
#ifdef COMPLEX
    /* NOTE(review): in this double-real precision these #ifdef COMPLEX
     * declarations and branches appear to be dead code left over from the
     * z->d precision generation -- confirm COMPLEX is never defined here. */
    static double zone  = (double) 1.0;
    static double dzero = (double) 0.0;
    double ztmp;
    double absztmp;
#endif
    double *C, *S;                          /* rotation coefficient workspaces */
    int N, NB, INgrsiz, INthgrsiz, BAND;
    int myid, grsiz, shift=3, stt, st, ed, stind, edind;
    int blklastind, colpt, PCOL, ACOL, MCOL;
    int stepercol, mylastid, grnb, grid;
    int *DEP,*MAXID;                        /* dependency slots / last task ID per column */
    int i, j, m;
    int thgrsiz, thgrnb, thgrid, thed;
    size_t eltsize = plasma_element_size(A.dtyp); /* NOTE(review): unused in this routine */

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    N  = A.m;    /* matrix order */
    NB = A.mb;   /* tile size = band width */

    /* Quick return */
    if (N == 0){
        return;
    }

    /* Degenerate band of width 0: the matrix is already diagonal. */
    if (NB == 0) {
        memset(D, 0, N*sizeof(double));
        memset(E, 0, (N-1)*sizeof(double));
#ifdef COMPLEX
        for (i=0; i<N; i++)
            D[i] = fabs(*A(i,i));
#else
        for (i=0; i<N; i++)
            D[i] = *A(i,i);
#endif
        return;
    }

    /*
     * Barrier is used because the bulge chasing has to wait until the
     * reduction to band has finished.  It could be removed once the task
     * dependencies are linked into the reduction-to-band stage.  Keep in
     * mind the case NB=1, where no bulge-chasing is needed.
     */
    /***************************************************************/
    QUARK_Barrier(plasma->quark);
    /* NOTE(review): tblg and Wtimming() are not declared in this file --
     * presumably debug-timing globals; confirm they exist at link time. */
    tblg = -Wtimming();
    /***************************************************************/

    /*
     * Case NB=1 ==> matrix is already bidiagonal; no need to bulge-chase.
     * Make diagonal and superdiagonal elements real, storing them in D and E.
     * If PlasmaLower, first transform the lower bidiagonal form to upper
     * bidiagonal by applying plane rotations / Householder from the left,
     * overwriting superdiagonal elements, then make the elements of the
     * resulting upper bidiagonal real.  If PlasmaUpper, just make its
     * elements real.  For Q, PT: ZSCAL should be done in case of WANTQ.
     */
    if (NB == 1){
        memset(D, 0,  N   *sizeof(double));
        memset(E, 0, (N-1)*sizeof(double));
#ifdef COMPLEX
        if(uplo==PlasmaLower){
            for (i=0; i<N; i++)
            {
                D[i] = ( *A(i, i) );             /* diag value */
                if( i < (N-1)) {                 /* lower off-diag value */
                    ztmp = *A((i+1),i);
                    absztmp = fabs(ztmp);
                    *A((i+1),i) = absztmp;
                    E[i] = absztmp;
                    /* sign/phase factor, propagated to the next off-diagonal */
                    if(absztmp != dzero)
                        ztmp = (double) (ztmp / absztmp);
                    else
                        ztmp = zone;
                    if(i<(N-2))
                        *A((i+2),(i+1)) = *A((i+2),(i+1)) * ztmp;
                    /* for Q: ZSCAL should be done in case of WANTQ */
                }
            }
        } else { /* PlasmaUpper */
            for (i=0; i<N; i++)
            {
                D[i] = ( *A(i,i) );              /* diag value */
                if(i<(N-1)) {                    /* upper off-diag value */
                    ztmp = *A(i, (i+1));
                    absztmp = fabs(ztmp);
                    *A(i,(i+1)) = absztmp;
                    E[i] = absztmp;
                    if(absztmp != dzero)
                        ztmp = (double) (ztmp / absztmp);
                    else
                        ztmp = zone;
                    if(i<(N-2))
                        *A((i+1),(i+2)) = *A((i+1),(i+2)) * ztmp;
                    /* for Q: ZSCAL should be done in case of WANTQ.
                     * HERE NEED THE multiply by CONJ(T) */
                }
            }
        } /* end PlasmaUpper */
#else
        /* Real case: entries are already real, just copy them out. */
        if( uplo == PlasmaLower ){
            for (i=0; i < N-1; i++)
            {
                D[i] = *A(i, i);
                E[i] = *A(i+1, i);
            }
            D[i] = *A(i, i);
        } else {
            for (i=0; i < N-1; i++)
            {
                D[i] = *A(i, i );
                E[i] = *A(i, i+1);
            }
            D[i] = *A(i, i);
        }
#endif
        return;
    }

    /* Case N<NB ==> matrix is very small; better to call LAPACK xSYTRD.
     * NOTE(review): the guard below tests N <= 0, which is unreachable after
     * the N == 0 quick return above -- it looks like it was meant to be
     * N <= NB.  The original comment says this branch will be removed. */
    if( N <= 0 ) /* this will be removed; we don't need it. */
    {
        double *work, *TTau;
        int info, ldwork = N*N;
        work = (double *) plasma_shared_alloc(plasma, ldwork, PlasmaRealDouble);
        TTau = (double *) plasma_shared_alloc(plasma, N,      PlasmaRealDouble);
        info = LAPACKE_dsytrd_work(LAPACK_COL_MAJOR, lapack_const(uplo), N,
                                   A(0,0), A.lm, D, E, TTau, work, ldwork);
        plasma_shared_free(plasma, (void*) work);
        plasma_shared_free(plasma, (void*) TTau);

        if( info == 0 )
            sequence->status = PLASMA_SUCCESS;
        else
            plasma_sequence_flush(plasma->quark, sequence, request, info);
        return;
    }

    /* General case NB > 1 && N > NB */
    DEP   = (int *)    plasma_shared_alloc(plasma, N+1, PlasmaInteger );
    MAXID = (int *)    plasma_shared_alloc(plasma, N+1, PlasmaInteger );
    C     = (double *) plasma_shared_alloc(plasma, N,   PlasmaRealDouble);
    S     = (double *) plasma_shared_alloc(plasma, N,   PlasmaRealDouble);
    memset(MAXID,0,(N+1)*sizeof(int));

    /***************************************************************************
     *                       START BULGE CHASING CODE
     **************************************************************************/
    /*
     * Initialisation of local parameters.  These should eventually become
     * input or tuned parameters.  grsiz = tasks grouped per kernel call,
     * thgrsiz = number of sweeps handled per thread group.
     */
    INgrsiz = 1;
    if( NB > 160 ) {
        INgrsiz = 2;
    }
    else if( NB > 100 ) {
        if( N < 5000 )
            INgrsiz = 2;
        else
            INgrsiz = 4;
    } else {
        INgrsiz = 6;
    }
    INthgrsiz = N;
    BAND      = 0;

    grsiz   = INgrsiz;
    thgrsiz = INthgrsiz;
    if( grsiz   == 0 ) grsiz   = 6;
    if( thgrsiz == 0 ) thgrsiz = N;

    /* steps needed to cover the "shift" distance with groups of grsiz */
    i = shift/grsiz;
    stepercol = i*grsiz == shift ? i:i+1;

    /* number of thread groups over the N-2 sweeps (ceiling division) */
    i = (N-2)/thgrsiz;
    thgrnb = i*thgrsiz == (N-2) ? i:i+1;

    for (thgrid = 1; thgrid<=thgrnb; thgrid++){
        stt  = (thgrid-1)*thgrsiz+1;
        thed = min( (stt + thgrsiz -1), (N-2));
        for (i = stt; i <= N-2; i++){
            ed=min(i,thed);
            if(stt>ed)break;
            for (m = 1; m <=stepercol; m++){
                st=stt;
                for (j = st; j <=ed; j++){
                    /* PCOL: dependency on the ID of the master of the group of
                     *       the previous column (Previous Column: PCOL).
                     * ACOL: dependency on the ID of the master of the previous
                     *       group of my column (Actual Column: ACOL);
                     *       it is 0 (NULL) for myid=1.
                     * MCOL: OUTPUT dependency on my ID, to be used by the next
                     *       ID (My Column: MCOL).  I am the master of this group. */
                    myid     = (i-j)*(stepercol*grsiz) +(m-1)*grsiz + 1;
                    mylastid = myid+grsiz-1;
                    PCOL     = mylastid+shift-1;  /* candidate ID in the previous column */
                    MAXID[j] = myid;
                    /* For the last columns only 1 or 2 kernels may run, so the
                     * computed PCOL can overshoot; clamp it to the last ID of
                     * the previous column. */
                    PCOL = min(PCOL,MAXID[j-1]);
                    /* round PCOL down to the master ID of its group */
                    grnb = PCOL/grsiz;
                    grid = grnb*grsiz == PCOL ? grnb:grnb+1;
                    PCOL = (grid-1)*grsiz +1;
                    ACOL = myid-grsiz;
                    if(myid==1)ACOL=0;
                    MCOL = myid;

                    QUARK_CORE_dtrdalg(
                        plasma->quark, &task_flags,
                        uplo, N, NB,
                        &A, C, S, i, j, m, grsiz, BAND,
                        DEP(PCOL), DEP(ACOL), DEP(MCOL) );

                    /* Index of the last matrix entry this group touches; when
                     * the sweep reaches the bottom of the matrix (>= N-1), the
                     * next sweep's starting column may advance. */
                    if(mylastid%2 ==0){
                        blklastind = (mylastid/2)*NB+1+j-1;
                    }else{
                        colpt = ((mylastid+1)/2)*NB + 1 +j -1 ;
                        stind = colpt-NB+1;
                        edind = min(colpt,N);
                        if( (stind>=edind-1) && (edind==N) )
                            blklastind=N;
                        else
                            blklastind=0;
                    }
                    if(blklastind >= (N-1))  stt=stt+1;
                } /* END for j=st:ed */
            } /* END for m=1:stepercol */
        } /* END for i=1:MINMN-2 */
    } /* END for thgrid=1:thgrnb */

    /*
     * Barrier used only for now, to be sure everything is done before
     * copying D and E and freeing the workspace.  It will be removed later
     * when D and E are filled directly during the bulge process.
     */
    QUARK_Barrier(plasma->quark);
    tblg += Wtimming();
    //printf(" done with bulge %lf \n\n\n",tblg);
    plasma_shared_free(plasma, (void*) DEP);
    plasma_shared_free(plasma, (void*) MAXID);
    plasma_shared_free(plasma, (void*) C);
    plasma_shared_free(plasma, (void*) S);

    /*
     * STORE THE RESULTING diagonal/off-diagonal in D AND E
     */
    memset(D, 0,  N   *sizeof(double));
    memset(E, 0, (N-1)*sizeof(double));

    /* Make diagonal and superdiagonal elements real, storing them in D and E.
     * In the complex case the off-diagonal elements are not necessarily real;
     * they must be made real and copied to E.  With Householder elimination,
     * ZLARFG returns a real value, so all elements except the last
     * off-diagonal are already real and only that one needs its absolute
     * value taken. */
#ifdef COMPLEX
    if(uplo==PlasmaLower){
        for (i=0; i < N-1 ; i++)
        {
            D[i] = ( *A(i,i) );
            /*
             * Householder case: all off-diagonals are real except the last
             * one, where we have to take the abs.
             */
            if(i<(N-2))
                E[i] = (*A(i+1, i));
            else
                E[i] = fabs( *A(i+1, i));
        }
        D[i] = ( *A(i, i) );
    } else { /* PlasmaUpper */
        for (i=0; i<N-1; i++)
        {
            D[i] = ( *A(i,i) );
            /*
             * Householder case: all off-diagonals are real except the last
             * one, where we have to take the abs.
             */
            if( i < (N-2) )
                E[i] = (*A(i, (i+1)));
            else
                E[i] = fabs(*A(i, (i+1)));
        }
        D[i] = ( *A(i, i) );
    } /* end PlasmaUpper */
#else
    if( uplo == PlasmaLower ){
        for (i=0; i < N-1; i++)
        {
            D[i] = *A(i, i);
            E[i] = *A(i+1, i);
        }
        D[i] = *A(i, i);
    } else {
        for (i=0; i < N-1; i++)
        {
            D[i] = *A(i, i );
            E[i] = *A(i, i+1);
        }
        D[i] = *A(i, i);
    }
#endif
} /* END FUNCTION */
/** **************************************************************************** * * @ingroup InPlaceTransformation * * plasma_dgetmi2 Implementation of inplace transposition * based on the GKK algorithm by Gustavson, Karlsson, Kagstrom. * This algorithm shift some cycles to transpose the matrix. * ******************************************************************************* * * @param[in] m * Number of rows of matrix A * * @param[in] n * Number of columns of matrix A * * @param[in,out] A * Matrix of size L*m*n * * @param[in] nprob * Number of parallel and independant problems * * @param[in] me * Number of rows of the problem * * @param[in] ne * Number of columns in the problem * * @param[in] L * Size of chunk to use for transformation * ******************************************************************************/ int plasma_dshift(plasma_context_t *plasma, int m, int n, double *A, int nprob, int me, int ne, int L, PLASMA_sequence *sequence, PLASMA_request *request) { int *leaders = NULL; int ngrp, thrdbypb, thrdtot, nleaders; /* Check Plasma context */ thrdtot = PLASMA_SIZE; thrdbypb = PLASMA_GRPSIZE; ngrp = thrdtot/thrdbypb; /* check input */ if( (nprob * me * ne * L) != (m * n) ) { plasma_error(__func__, "problem size does not match matrix size"); /*printf("m=%d, n=%d, nprob=%d, me=%d, ne=%d, L=%d\n", m, n, nprob, me, ne, L);*/ return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } if( thrdbypb > thrdtot ) { plasma_error(__func__, "number of thread per problem must be less or equal to total number of threads"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } if( (thrdtot % thrdbypb) != 0 ) { plasma_error(__func__, "number of thread per problem must divide the total number of thread"); return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE); } /* quick return */ if( (me < 2) || (ne < 2) || (nprob < 1) ) { return PLASMA_SUCCESS; } GKK_getLeaderNbr(me, ne, &nleaders, &leaders); nleaders *= 3; if 
(PLASMA_SCHEDULING == PLASMA_STATIC_SCHEDULING) { int *Tp = NULL; int i, ipb; int q, owner; q = me*ne - 1; Tp = (int *)plasma_shared_alloc(plasma, thrdtot, PlasmaInteger); for (i=0; i<thrdtot; i++) Tp[i] = 0; ipb = 0; /* First part with coarse parallelism */ if (nprob > ngrp) { ipb = (nprob / ngrp)*ngrp; /* loop over leader */ if (thrdbypb > 1) { for (i=0; i<nleaders; i+=3) { /* assign this cycle to a thread */ owner = minloc(thrdbypb, Tp); /* assign it to owner */ Tp[owner] = Tp[owner] + leaders[i+1] * L; leaders[i+2] = owner; } GKK_BalanceLoad(thrdbypb, Tp, leaders, nleaders, L); } else { for (i=0; i<nleaders; i+=3) { Tp[0] = Tp[0] + leaders[i+1] * L; leaders[i+2] = 0; } } /* shift in parallel */ for (i=0; i< (nprob/ngrp); i++) { plasma_static_call_9(plasma_pdshift, int, me, int, ne, int, L, double*, &(A[i*ngrp*me*ne*L]), int *, leaders, int, nleaders, int, thrdbypb, PLASMA_sequence*, sequence, PLASMA_request*, request); } }