void warmup(Quark *q){ int NB = 200; double *H = (double*) malloc(NB*NB*OOC_NTHREADS*sizeof(double)); double *D = (double*) offload_Alloc(NB*NB*OOC_NTHREADS*sizeof(double), 0); { Quark_Task_Flags tflags = Quark_Task_Flags_Initializer; // for(int r = 0; r < OOC_NTHREADS; r++){ for(int r = 0; r < 2; r++){ QUARK_Task_Flag_Set(&tflags, TASK_LOCK_TO_THREAD, r); // QUARK_Task_Flag_Set(&tflags, THREAD_SET_TO_MANUAL_SCHEDULING, (r==0)||(r==1)); QUARK_Insert_Task(q, CORE_H2D, &tflags, sizeof(int), &NB, VALUE, sizeof(int), &NB, VALUE, sizeof(double), H+r*NB*NB, INPUT, sizeof(int), &NB, VALUE, sizeof(double), D+r*NB*NB, OUTPUT, sizeof(int), &NB, VALUE, 0); QUARK_Insert_Task(q, CORE_D2H, &tflags, sizeof(int), &NB, VALUE, sizeof(int), &NB, VALUE, sizeof(double), D+r*NB*NB, INPUT, sizeof(int), &NB, VALUE, sizeof(double), H+r*NB*NB, OUTPUT, sizeof(int), &NB, VALUE, 0); } } QUARK_Barrier(q); offload_Free(D, 0); free(H); }
/***************************************************************************//** * Parallel Reduction from BAND tridiagonal to the final condensed form - dynamic scheduler **/ void plasma_pzhbrdt_quark(PLASMA_enum uplo, PLASMA_desc A, double *D, double *E, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; #ifdef COMPLEX static PLASMA_Complex64_t zone = (PLASMA_Complex64_t) 1.0; static double dzero = (double) 0.0; PLASMA_Complex64_t ztmp; double absztmp; #endif PLASMA_Complex64_t *C, *S; int blksweep, lcsweep, blkid, lcNB; int N, NB, NT, grsiz, lcgrsiz; int i; size_t eltsize = plasma_element_size(A.dtyp); plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); NT = A.nt; N = A.m; NB = A.mb; /* Quick return */ if (N == 0){ return; } if (NB == 0) { memset(D, 0, N*sizeof(double)); memset(E, 0, (N-1)*sizeof(double)); #ifdef COMPLEX for (i=0; i<N; i++) D[i] = cabs(*A(i,i)); #else for (i=0; i<N; i++) D[i] = *A(i,i); #endif return; } /* * Barrier is used because the bulge have to wait until * the reduction to band has been finish. * otherwise, I can remove this BARRIER when I integrate * the function dependencies link inside the reduction to * band. Keep in min the case when NB=1, where no bulge-chasing. */ /***************************************************************/ QUARK_Barrier(plasma->quark); tblg = -Wtimming(); /***************************************************************/ /* * Case NB=1 ==> matrix is already Bidiagonal. no need to bulge. * Make diagonal and superdiagonal elements real, storing them in * D and E. if PlasmaLower, first transform lower bidiagonal form * to upper bidiagonal by applying plane rotations/ Householder * from the left, overwriting superdiagonal elements then make * elements real of the resulting upper Bidiagonal. if PlasmaUpper * then make its elements real. For Q, PT: ZSCAL should be done * in case of WANTQ. */ if (NB == 1){ memset(D, 0, N *sizeof(double)); memset(E, 0, (N-1)*sizeof(double)); #ifdef COMPLEX if(uplo==PlasmaLower){ for (i=0; i<N; i++) { D[i] = creal( *A(i, i) ); /* diag value */ if( i < (N-1)) { /* lower off-diag value */ ztmp = *A((i+1),i); absztmp = cabs(ztmp); *A((i+1),i) = absztmp; E[i] = absztmp; if(absztmp != dzero) ztmp = (PLASMA_Complex64_t) (ztmp / absztmp); else ztmp = zone; if(i<(N-2)) *A((i+2),(i+1)) = *A((i+2),(i+1)) * ztmp; /* for Q: ZSCAL should be done in case of WANTQ */ } } } else { /* PlasmaUpper */ for (i=0; i<N; i++) { D[i] = creal( *A(i,i) ); /* diag value*/ if(i<(N-1)) { /* lower off-diag value */ ztmp = *A(i, (i+1)); absztmp = cabs(ztmp); *A(i,(i+1)) = absztmp; E[i] = absztmp; if(absztmp != dzero) ztmp = (PLASMA_Complex64_t) (ztmp / absztmp); else ztmp = zone; if(i<(N-2)) *A((i+1),(i+2)) = *A((i+1),(i+2)) * ztmp; /* for Q: ZSCAL should be done in case of WANTQ. HERE NEED THE multiply by CONJ(T) */ } } } /* end PlasmaUpper*/ #else if( uplo == PlasmaLower ){ for (i=0; i < N-1; i++) { D[i] = *A(i, i); E[i] = *A(i+1, i); } D[i] = *A(i, i); } else { for (i=0; i < N-1; i++) { D[i] = *A(i, i ); E[i] = *A(i, i+1); } D[i] = *A(i, i); } #endif return; } /* Case N<NB ==> matrix is very small and better to call lapack XHETRD. */ if( N <= 0 ) /* this will be removed we don t need it. */ { PLASMA_Complex64_t *work, *TTau; int info, ldwork = N*N; work = (PLASMA_Complex64_t *) plasma_shared_alloc(plasma, ldwork, PlasmaComplexDouble); TTau = (PLASMA_Complex64_t *) plasma_shared_alloc(plasma, N, PlasmaComplexDouble); info = LAPACKE_zhetrd_work(LAPACK_COL_MAJOR, lapack_const(uplo), N, A(0,0), A.lm, D, E, TTau, work, ldwork); plasma_shared_free(plasma, (void*) work); plasma_shared_free(plasma, (void*) TTau); if( info == 0 ) sequence->status = PLASMA_SUCCESS; else plasma_sequence_flush(plasma->quark, sequence, request, info); return; } /* General case NB > 1 && N > NB */ C = (PLASMA_Complex64_t *) plasma_shared_alloc(plasma, N, PlasmaComplexDouble); S = (PLASMA_Complex64_t *) plasma_shared_alloc(plasma, N, PlasmaComplexDouble); /*************************************************************************** * START BULGE CHASING CODE **************************************************************************/ /* * Initialisation of local parameter. those parameter should be * input or tuned parameter. */ grsiz = 1; if( NB > 160 ) { grsiz = 1; } else if( NB > 100 ) { grsiz = 1; /* if( N < 5000 ) grsiz = 1; else grsiz = 2; */ } else { grsiz = 2; } grsiz = max(1, grsiz); /*grsiz=1;*/ /*printf(" Version -dp- N %5d NB %5d lcNB %5d grsiz %5d A.ln %5d A.nb %5d \n",N,NB,lcNB,grsiz,A.ln,A.nb);*/ for (blksweep = 0; blksweep<NT; blksweep++){ lcNB = blksweep == NT-1 ? A.n-blksweep*A.nb : A.nb; /*printf(" Version -dp- N %5d NB %5d lcNB %5d grsiz %5d blksweep%5d NT %5d \n",N,NB,lcNB,grsiz,blksweep,NT);*/ for (lcsweep = 0; lcsweep<lcNB; lcsweep++){ for (blkid = blksweep; blkid<NT; blkid=blkid+grsiz){ lcgrsiz = (blkid+1) < NT ? grsiz : NT-blkid; /*printf(" Version -dp- N %5d NB %5d lcNB %5d grsiz %5d lcgrsiz %5d blkid %5d \n",N,NB,lcNB,grsiz,lcgrsiz,blkid);*/ QUARK_CORE_ztrdalg_v2( plasma->quark, &task_flags, uplo, &A, C, S, lcgrsiz, lcsweep, blkid, blksweep); } } } /* * Barrier used only for now, to be sure that everything * is done before copying the D and E and free workspace. * this will be removed later when D and E are directly filled * during the bulge process. */ QUARK_Barrier(plasma->quark); tblg += Wtimming(); printf(" done with bulge %lf \n\n\n",tblg); plasma_shared_free(plasma, (void*) C); plasma_shared_free(plasma, (void*) S); /* * STORE THE RESULTING diagonal/off-diagonal in D AND E */ memset(D, 0, N *sizeof(double)); memset(E, 0, (N-1)*sizeof(double)); /* Make diagonal and superdiagonal elements real, * storing them in D and E */ /* In complex case, the off diagonal element are * not necessary real. we have to make off-diagonal * elements real and copy them to E. * When using HouseHolder elimination, * the ZLARFG give us a real as output so, all the * diagonal/off-diagonal element except the last one are already * real and thus we need only to take the abs of the last * one. * */ #ifdef COMPLEX if(uplo==PlasmaLower){ for (i=0; i < N-1 ; i++) { D[i] = creal( *A(i,i) ); /* * Alternative for Householder case, all off-diag * are real except the last off-diag, where we * have to take the abs */ if(i<(N-2)) E[i] = creal(*A(i+1, i)); else E[i] = cabs( *A(i+1, i)); } D[i] = creal( *A(i, i) ); } else { /* PlasmaUpper */ for (i=0; i<N-1; i++) { D[i] = creal( *A(i,i) ); /* * Alternative for Householder case, all off-diag * are real except the last off-diag, where we * have to take the abs */ if( i < (N-2) ) E[i] = creal(*A(i, (i+1))); else E[i] = cabs(*A(i, (i+1))); } D[i] = creal( *A(i, i) ); } /* end PlasmaUpper */ #else if( uplo == PlasmaLower ){ for (i=0; i < N-1; i++) { D[i] = *A(i, i); E[i] = *A(i+1, i); } D[i] = *A(i, i); } else { for (i=0; i < N-1; i++) { D[i] = *A(i, i ); E[i] = *A(i, i+1); } D[i] = *A(i, i); } #endif } /* END FUNCTION */
/***************************************************************************//** * Parallel Reduction from BAND tridiagonal to the final condensed form - dynamic scheduler **/ void plasma_pdsbrdt_quark(PLASMA_enum uplo, PLASMA_desc A, double *D, double *E, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request) { plasma_context_t *plasma; Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer; #ifdef COMPLEX static double zone = (double) 1.0; static double dzero = (double) 0.0; double ztmp; double absztmp; #endif double *C, *S; int N, NB, INgrsiz, INthgrsiz, BAND; int myid, grsiz, shift=3, stt, st, ed, stind, edind; int blklastind, colpt, PCOL, ACOL, MCOL; int stepercol, mylastid, grnb, grid; int *DEP,*MAXID; int i, j, m; int thgrsiz, thgrnb, thgrid, thed; size_t eltsize = plasma_element_size(A.dtyp); plasma = plasma_context_self(); if (sequence->status != PLASMA_SUCCESS) return; QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence); N = A.m; NB = A.mb; /* Quick return */ if (N == 0){ return; } if (NB == 0) { memset(D, 0, N*sizeof(double)); memset(E, 0, (N-1)*sizeof(double)); #ifdef COMPLEX for (i=0; i<N; i++) D[i] = fabs(*A(i,i)); #else for (i=0; i<N; i++) D[i] = *A(i,i); #endif return; } /* * Barrier is used because the bulge have to wait until * the reduction to band has been finish. * otherwise, I can remove this BARRIER when I integrate * the function dependencies link inside the reduction to * band. Keep in min the case when NB=1, where no bulge-chasing. */ /***************************************************************/ QUARK_Barrier(plasma->quark); tblg = -Wtimming(); /***************************************************************/ /* * Case NB=1 ==> matrix is already Bidiagonal. no need to bulge. * Make diagonal and superdiagonal elements real, storing them in * D and E. if PlasmaLower, first transform lower bidiagonal form * to upper bidiagonal by applying plane rotations/ Householder * from the left, overwriting superdiagonal elements then make * elements real of the resulting upper Bidiagonal. if PlasmaUpper * then make its elements real. For Q, PT: ZSCAL should be done * in case of WANTQ. */ if (NB == 1){ memset(D, 0, N *sizeof(double)); memset(E, 0, (N-1)*sizeof(double)); #ifdef COMPLEX if(uplo==PlasmaLower){ for (i=0; i<N; i++) { D[i] = ( *A(i, i) ); /* diag value */ if( i < (N-1)) { /* lower off-diag value */ ztmp = *A((i+1),i); absztmp = fabs(ztmp); *A((i+1),i) = absztmp; E[i] = absztmp; if(absztmp != dzero) ztmp = (double) (ztmp / absztmp); else ztmp = zone; if(i<(N-2)) *A((i+2),(i+1)) = *A((i+2),(i+1)) * ztmp; /* for Q: ZSCAL should be done in case of WANTQ */ } } } else { /* PlasmaUpper */ for (i=0; i<N; i++) { D[i] = ( *A(i,i) ); /* diag value*/ if(i<(N-1)) { /* lower off-diag value */ ztmp = *A(i, (i+1)); absztmp = fabs(ztmp); *A(i,(i+1)) = absztmp; E[i] = absztmp; if(absztmp != dzero) ztmp = (double) (ztmp / absztmp); else ztmp = zone; if(i<(N-2)) *A((i+1),(i+2)) = *A((i+1),(i+2)) * ztmp; /* for Q: ZSCAL should be done in case of WANTQ. HERE NEED THE multiply by CONJ(T) */ } } } /* end PlasmaUpper*/ #else if( uplo == PlasmaLower ){ for (i=0; i < N-1; i++) { D[i] = *A(i, i); E[i] = *A(i+1, i); } D[i] = *A(i, i); } else { for (i=0; i < N-1; i++) { D[i] = *A(i, i ); E[i] = *A(i, i+1); } D[i] = *A(i, i); } #endif return; } /* Case N<NB ==> matrix is very small and better to call lapack XHETRD. */ if( N <= 0 ) /* this will be removed we don t need it. */ { double *work, *TTau; int info, ldwork = N*N; work = (double *) plasma_shared_alloc(plasma, ldwork, PlasmaRealDouble); TTau = (double *) plasma_shared_alloc(plasma, N, PlasmaRealDouble); info = LAPACKE_dsytrd_work(LAPACK_COL_MAJOR, lapack_const(uplo), N, A(0,0), A.lm, D, E, TTau, work, ldwork); plasma_shared_free(plasma, (void*) work); plasma_shared_free(plasma, (void*) TTau); if( info == 0 ) sequence->status = PLASMA_SUCCESS; else plasma_sequence_flush(plasma->quark, sequence, request, info); return; } /* General case NB > 1 && N > NB */ DEP = (int *) plasma_shared_alloc(plasma, N+1, PlasmaInteger ); MAXID = (int *) plasma_shared_alloc(plasma, N+1, PlasmaInteger ); C = (double *) plasma_shared_alloc(plasma, N, PlasmaRealDouble); S = (double *) plasma_shared_alloc(plasma, N, PlasmaRealDouble); memset(MAXID,0,(N+1)*sizeof(int)); /*************************************************************************** * START BULGE CHASING CODE **************************************************************************/ /* * Initialisation of local parameter. those parameter should be * input or tuned parameter. */ INgrsiz = 1; if( NB > 160 ) { INgrsiz = 2; } else if( NB > 100 ) { if( N < 5000 ) INgrsiz = 2; else INgrsiz = 4; } else { INgrsiz = 6; } INthgrsiz = N; BAND = 0; grsiz = INgrsiz; thgrsiz = INthgrsiz; if( grsiz == 0 ) grsiz = 6; if( thgrsiz == 0 ) thgrsiz = N; i = shift/grsiz; stepercol = i*grsiz == shift ? i:i+1; i = (N-2)/thgrsiz; thgrnb = i*thgrsiz == (N-2) ? i:i+1; for (thgrid = 1; thgrid<=thgrnb; thgrid++){ stt = (thgrid-1)*thgrsiz+1; thed = min( (stt + thgrsiz -1), (N-2)); for (i = stt; i <= N-2; i++){ ed=min(i,thed); if(stt>ed)break; for (m = 1; m <=stepercol; m++){ st=stt; for (j = st; j <=ed; j++){ /* PCOL: dependency on the ID of the master of the group of the previous column. (Previous Column:PCOL). */ /* ACOL: dependency on the ID of the master of the previous group of my column. (Acctual Column:ACOL). (it is 0(NULL) for myid=1) */ /* MCOL: OUTPUT dependency on the my ID, to be used by the next ID. (My Column: MCOL). I am the master of this group. */ myid = (i-j)*(stepercol*grsiz) +(m-1)*grsiz + 1; mylastid = myid+grsiz-1; PCOL = mylastid+shift-1; /* to know the dependent ID of the previous column. need to know the master of its group */ MAXID[j] = myid; PCOL = min(PCOL,MAXID[j-1]); /* for the last columns, we might do only 1 or 2 kernel, so the PCOL will be wrong. this is to force it to the last ID of the previous col.*/ grnb = PCOL/grsiz; grid = grnb*grsiz == PCOL ? grnb:grnb+1; PCOL = (grid-1)*grsiz +1; /* give me the ID of the master of the group of the previous column. */ ACOL = myid-grsiz; if(myid==1)ACOL=0; MCOL = myid; QUARK_CORE_dtrdalg( plasma->quark, &task_flags, uplo, N, NB, &A, C, S, i, j, m, grsiz, BAND, DEP(PCOL), DEP(ACOL), DEP(MCOL) ); if(mylastid%2 ==0){ blklastind = (mylastid/2)*NB+1+j-1; }else{ colpt = ((mylastid+1)/2)*NB + 1 +j -1 ; stind = colpt-NB+1; edind = min(colpt,N); if( (stind>=edind-1) && (edind==N) ) blklastind=N; else blklastind=0; } if(blklastind >= (N-1)) stt=stt+1; } /* END for j=st:ed */ } /* END for m=1:stepercol */ } /* END for i=1:MINMN-2 */ } /* END for thgrid=1:thgrnb */ /* * Barrier used only for now, to be sure that everything * is done before copying the D and E and free workspace. * this will be removed later when D and E are directly filled * during the bulge process. */ QUARK_Barrier(plasma->quark); tblg += Wtimming(); //printf(" done with bulge %lf \n\n\n",tblg); plasma_shared_free(plasma, (void*) DEP); plasma_shared_free(plasma, (void*) MAXID); plasma_shared_free(plasma, (void*) C); plasma_shared_free(plasma, (void*) S); /* * STORE THE RESULTING diagonal/off-diagonal in D AND E */ memset(D, 0, N *sizeof(double)); memset(E, 0, (N-1)*sizeof(double)); /* Make diagonal and superdiagonal elements real, * storing them in D and E */ /* In complex case, the off diagonal element are * not necessary real. we have to make off-diagonal * elements real and copy them to E. * When using HouseHolder elimination, * the ZLARFG give us a real as output so, all the * diagonal/off-diagonal element except the last one are already * real and thus we need only to take the abs of the last * one. * */ #ifdef COMPLEX if(uplo==PlasmaLower){ for (i=0; i < N-1 ; i++) { D[i] = ( *A(i,i) ); /* * Alternative for Householder case, all off-diag * are real except the last off-diag, where we * have to take the abs */ if(i<(N-2)) E[i] = (*A(i+1, i)); else E[i] = fabs( *A(i+1, i)); } D[i] = ( *A(i, i) ); } else { /* PlasmaUpper */ for (i=0; i<N-1; i++) { D[i] = ( *A(i,i) ); /* * Alternative for Householder case, all off-diag * are real except the last off-diag, where we * have to take the abs */ if( i < (N-2) ) E[i] = (*A(i, (i+1))); else E[i] = fabs(*A(i, (i+1))); } D[i] = ( *A(i, i) ); } /* end PlasmaUpper */ #else if( uplo == PlasmaLower ){ for (i=0; i < N-1; i++) { D[i] = *A(i, i); E[i] = *A(i+1, i); } D[i] = *A(i, i); } else { for (i=0; i < N-1; i++) { D[i] = *A(i, i ); E[i] = *A(i, i+1); } D[i] = *A(i, i); } #endif } /* END FUNCTION */
double Cholesky(Quark *quark, double *A, int N, int NB, int LDA, size_t memsize) { #define A(ib,jb) A[(size_t)(jb)*NB*LDA+(ib)*NB] #ifndef USE_MIC cublasStatus cu_status; #endif int bb = (N + NB - 1) / NB; int YM, YN; int Ym, Yn; int JB; int jb, jjb; int memBlock = memsize/sizeof(double)/NB/NB; double *X, *Y; #ifdef USE_MIC Y = (double*) offload_Alloc((size_t)memBlock*NB*NB*sizeof(double), 0); assert(Y != NULL); #else #ifdef USE_CUBLASV2 { cudaError_t ierr; ierr = cudaMalloc((void **) &Y, (size_t) memBlock*NB*NB*sizeof(double)); assert(ierr == cudaSuccess); } #else cu_status = cublasAlloc((size_t) memBlock*NB*NB, sizeof(double), (void **) &Y); CHKERR(cu_status); #endif #endif double t1; double llttime = MPI_Wtime(); /*--------------------------------------*/ /* The main Ypanel loop */ // QUARK_Barrier(quark); for (JB = 0, jb = 0; JB < N; JB+=YN, jb+=Yn) { //determine size of Ypanel Ym = bb - jb; Yn = find_Yn(bb, memBlock, jb); YM = N - JB; YN = MIN((jb+Yn)*NB, N) - jb*NB; X = Y + (size_t)(memBlock-Ym)*NB*NB; printf("bb %d jb %d YM %d YN %d Ym %d Yn %d Y %p X %p\n", bb, jb, YM, YN, Ym, Yn, Y, X); /* Copy in data */ A2Y(quark, &A(jb,jb), Y, LDA, NB, YM, YN); /* Left-looking */ for(jjb = 0; jjb < jb; jjb++){ /* copy from A to X */ A2X(quark, &A(jb,jjb), LDA, X, NB, YM); ooc_syrk(quark, X, Y, YM, YN, NB); } /* incore factorization */ ooc_incore(quark, &A(jb,jb), Y, LDA, NB, YM, YN); /* Copy out data */ // Y2A(quark, Y, &A(jb,jb), LDA, NB, YM, YN); // QUARK_Barrier(quark); // reduce parallelism // goto oasdfh; // early stop } oasdfh: QUARK_Barrier(quark); llttime = MPI_Wtime() - llttime; printf("llt time %lf %lf\n", llttime, MPI_Wtime()); printf("%lf %lf\n", A[(N-1)*LDA+N-1], MPI_Wtime()); /*--------------------------------------*/ #ifdef USE_MIC offload_Free(Y,0); #else #ifdef USE_CUBLASV2 { cudaError_t ierr; ierr = cudaFree((void *) Y); assert(ierr == cudaSuccess); Y = 0; } #else cu_status = cublasFree(Y); CHKERR(cu_status); #endif #endif return llttime; #undef A }
extern "C" magma_int_t magma_zgetrf_mc(magma_context *cntxt, int *m, int *n, cuDoubleComplex *a, int *lda, int *ipiv, int *info) { /* -- MAGMA (version 1.6.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date January 2015 Purpose ======= ZGETRF computes an LU factorization of a general COMPLEX_16 M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments ========= CNTXT (input) MAGMA_CONTEXT CNTXT specifies the MAGMA hardware context for this routine. M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ if (cntxt->num_cores == 1 && cntxt->num_gpus == 1) { //int result = magma_zgetrf(*m, *n, a, *lda, ipiv, info); //return result; } int EN_BEE = cntxt->nb; Quark* quark = cntxt->quark; int i,j,l; int ii,jj,ll; void *fakedep; int ione=1; cuDoubleComplex fone = MAGMA_Z_ONE; cuDoubleComplex mone = MAGMA_Z_NEG_ONE; int M,N,MM,NN,MMM,K; int priority=0; *info = 0; int nb = (EN_BEE==-1)? magma_get_zpotrf_nb(*n): EN_BEE; /* Check arguments */ if (*m < 0) { *info = -1; } else if (*n < 0) { *info = -2; } else if (*lda < max(1,*m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return MAGMA_ERR_ILLEGAL_VALUE; } int k = min(*m,*n); int iinfo[2]; iinfo[1] = 0; char label[10000]; ii = -1; /* Loop across diagonal blocks */ for (i = 0; i < k; i += nb) { ii++; jj = -1; priority = 10000 - ii; /* Update panels in left looking fashion */ for (j = 0; j < i; j += nb) { jj++; NN=min(nb,(*n)-i); MM=min(nb,(*m)-j); l = j + nb; MMM = min(nb,(*m)-l); sprintf(label, "UPDATE %d %d", ii, jj); QUARK_Insert_Task(quark, SCHED_panel_update, 0, sizeof(int), &NN, VALUE, sizeof(cuDoubleComplex)*(*m)*(*n), A(j,i), INOUT, sizeof(int), lda, VALUE, sizeof(int), &MM, VALUE, sizeof(cuDoubleComplex)*nb, &ipiv[j], INPUT, sizeof(cuDoubleComplex)*(*m)*(*n), A(j,j), INPUT, sizeof(int), &MMM, VALUE, sizeof(int), &nb, VALUE, sizeof(cuDoubleComplex)*(*m)*(*n), A(l,j), INPUT, sizeof(cuDoubleComplex)*(*m)*(*n), A(l,i), INOUT, sizeof(int), &priority,VALUE | TASK_PRIORITY, sizeof(cuDoubleComplex)*(*m)*(*n), A(i,i), OUTPUT, strlen(label)+1, label, VALUE | TASKLABEL, 5, "cyan", VALUE | TASKCOLOR, 0); ll = jj + 1; /* Split gemm into tiles */ for (l = j + (2*nb); l < (*m); l += nb) { ll++; MMM = min(nb,(*m)-l); fakedep = (void *)(intptr_t)(j+1); sprintf(label, "GEMM %d %d %d", ii, jj, ll); QUARK_Insert_Task(quark, SCHED_zgemm, 0, sizeof(int), &MMM, VALUE, sizeof(int), &NN, VALUE, sizeof(int), &nb, VALUE, sizeof(cuDoubleComplex)*(*m)*(*n), A(l,j), INPUT, sizeof(int), lda, VALUE, sizeof(cuDoubleComplex)*(*m)*(*n), A(j,i), INPUT, sizeof(cuDoubleComplex)*(*m)*(*n), A(l,i), INOUT, sizeof(int), &priority,VALUE | TASK_PRIORITY, sizeof(cuDoubleComplex)*(*m)*(*n), A(i,i), OUTPUT | GATHERV, sizeof(void*), fakedep, OUTPUT | GATHERV, strlen(label)+1, label, VALUE | TASKLABEL, 5, "blue", VALUE | TASKCOLOR, 0); } } M=(*m)-i; N=min(nb,(*n)-i); iinfo[0] = i; sprintf(label, "GETRF %d", ii); QUARK_Insert_Task(quark, SCHED_zgetrf, 0, sizeof(int), &M, VALUE, sizeof(int), &N, VALUE, sizeof(cuDoubleComplex)*(*m)*(*n), A(i,i), INOUT, sizeof(int), lda, VALUE, sizeof(cuDoubleComplex)*nb, &ipiv[i], OUTPUT, sizeof(int), iinfo, OUTPUT, sizeof(int), &priority,VALUE | TASK_PRIORITY, strlen(label)+1, label, VALUE | TASKLABEL, 6, "green", VALUE | TASKCOLOR, 0); } K = (*m)/nb; if ((K*nb)==(*m)) { ii = K - 1; K = *m; } else { ii = k; K = (K+1)*nb; } priority = 0; /* If n > m */ for (i = K; i < (*n); i += nb) { ii++; jj = -1; /* Update remaining panels in left looking fashion */ for (j = 0; j < (*m); j += nb) { jj++; NN=min(nb,(*n)-i); MM=min(nb,(*m)-j); l = j + nb; MMM = min(nb,(*m)-l); sprintf(label, "UPDATE %d %d", ii, jj); QUARK_Insert_Task(quark, SCHED_panel_update, 0, sizeof(int), &NN, VALUE, sizeof(cuDoubleComplex)*(*m)*(*n), A(j,i), INOUT, sizeof(int), lda, VALUE, sizeof(int), &MM, VALUE, sizeof(cuDoubleComplex)*nb, &ipiv[j], INPUT, sizeof(cuDoubleComplex)*(*m)*(*n), A(j,j), INPUT, sizeof(int), &MMM, VALUE, sizeof(int), &nb, VALUE, sizeof(cuDoubleComplex)*(*m)*(*n), A(l,j), INPUT, sizeof(cuDoubleComplex)*(*m)*(*n), A(l,i), INOUT, sizeof(int), &priority,VALUE | TASK_PRIORITY, sizeof(cuDoubleComplex)*(*m)*(*n), A(i,i), OUTPUT, strlen(label)+1, label, VALUE | TASKLABEL, 5, "cyan", VALUE | TASKCOLOR, 0); ll = jj + 1; /* Split gemm into tiles */ for (l = j + (2*nb); l < (*m); l += nb) { ll++; MMM = min(nb,(*m)-l); fakedep = (void *)(intptr_t)(j+1); sprintf(label, "GEMM %d %d %d", ii, jj, ll); QUARK_Insert_Task(quark, SCHED_zgemm, 0, sizeof(int), &MMM, VALUE, sizeof(int), &NN, VALUE, sizeof(int), &nb, VALUE, sizeof(cuDoubleComplex)*(*m)*(*n), A(l,j), INPUT, sizeof(int), lda, VALUE, sizeof(cuDoubleComplex)*(*m)*(*n), A(j,i), INPUT, sizeof(cuDoubleComplex)*(*m)*(*n), A(l,i), INOUT, sizeof(int), &priority,VALUE | TASK_PRIORITY, sizeof(cuDoubleComplex)*(*m)*(*n), A(i,i), OUTPUT | GATHERV, sizeof(void*), fakedep, OUTPUT | GATHERV, strlen(label)+1, label, VALUE | TASKLABEL, 5, "blue", VALUE | TASKCOLOR, 0); } } } ii = -1; /* Swap behinds */ for (i = 0; i < k; i += nb) { ii++; jj = -1; MM = min(nb,(*m)-i); MM = min(MM,(*n)-i); for (j = 0; j < i; j += nb) { jj++; fakedep = (void *)(intptr_t)(j+1); sprintf(label, "LASWPF %d %d", ii, jj); QUARK_Insert_Task(quark, SCHED_zlaswp, 0, sizeof(int), &nb, VALUE, sizeof(cuDoubleComplex)*(*m)*(*n), A(i,j), INOUT, sizeof(int), lda, VALUE, sizeof(int), &MM, VALUE, sizeof(cuDoubleComplex)*nb, &ipiv[i], INPUT, sizeof(int), &priority, VALUE | TASK_PRIORITY, sizeof(void*), fakedep, INPUT, sizeof(cuDoubleComplex)*(*m)*(*n), A(i+nb,j), OUTPUT, strlen(label)+1, label, VALUE | TASKLABEL, 7, "purple", VALUE | TASKCOLOR, 0); } } /* Synchronization point */ QUARK_Barrier(quark); /* Fix pivot */ ii = -1; for (i = 0; i < k; i +=nb) { ii++; for (j = 0; j < min(nb,(k-i)); j++) { ipiv[ii*nb+j] += ii*nb; } } QUARK_Barrier(quark); }
extern "C" magma_int_t magma_zpotrf_mc(magma_context *cntxt, char *uplo, magma_int_t *n, cuDoubleComplex *a, magma_int_t *lda, magma_int_t *info) { /* -- MAGMA (version 1.5.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date September 2014 Purpose ======= ZPOTRF computes the Cholesky factorization of a Hermitian positive definite matrix A. The factorization has the form A = U**T * U, if UPLO = 'U', or A = L * L**T, if UPLO = 'L', where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments ========= CNTXT (input) MAGMA_CONTEXT CNTXT specifies the MAGMA hardware context for this routine. UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) COMPLEX_16 array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization A = U**T*U or A = L*L**T. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. ===================================================================== */ if (cntxt->num_cores == 1 && cntxt->num_gpus == 1) { //magma_int_t result = magma_zpotrf(*uplo, *n, a, *lda, info); //return result; } // check arguments magma_int_t upper = (magma_int_t) lsame_(uplo, "U"); *info = 0; if (! upper && ! lsame_(uplo, "L")) { *info = -1; } else if (*n < 0) { *info = -2; } else if (*lda < max(1,*n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return MAGMA_ERR_ILLEGAL_VALUE; } Quark* quark = cntxt->quark; // get block size magma_int_t nb = (cntxt->nb ==-1)? magma_get_zpotrf_nb(*n): cntxt->nb; magma_int_t i,j,k; magma_int_t ii,jj,kk; magma_int_t temp,temp2,temp3; char label[10000]; magma_int_t iinfo[2]; iinfo[1] = 0; ii = -1; // traverse diagonal blocks for (i = 0; i < (*n); i += nb) { ii++; temp2 = min(nb,(*n)-i); // if not first block if (i > 0) { // first do large syrk, then split if (i < (*n)/2) { sprintf(label, "SYRK %d", ii); if (upper) { QUARK_Insert_Task(quark, SCHED_zsyrk, 0, sizeof(magma_int_t), &upper, VALUE, sizeof(magma_int_t), &temp2, VALUE, sizeof(magma_int_t), &i, VALUE, sizeof(cuDoubleComplex)*(*n)*(*n), A(0,i), INPUT, sizeof(magma_int_t), lda, VALUE, sizeof(cuDoubleComplex)*(*n)*(*n), A(i,i), INOUT, sizeof(cuDoubleComplex)*(*n)*(*n), A(i-nb,i), INPUT, strlen(label)+1, label, VALUE | TASKLABEL, 6, "green", VALUE | TASKCOLOR, 0); } else { QUARK_Insert_Task(quark, SCHED_zsyrk, 0, sizeof(magma_int_t), &upper, VALUE, sizeof(magma_int_t), &temp2, VALUE, sizeof(magma_int_t), &i, VALUE, sizeof(cuDoubleComplex)*(*n)*(*n), A(i,0), INPUT, sizeof(magma_int_t), lda, VALUE, sizeof(cuDoubleComplex)*(*n)*(*n), A(i,i), INOUT, sizeof(cuDoubleComplex)*(*n)*(*n), A(i,i-nb), INPUT, strlen(label)+1, label, VALUE | TASKLABEL, 6, "green", VALUE | TASKCOLOR, 0); } } else { jj = -1; // split syrk into tiles for (j = 0; j < i; j += nb) { jj++; sprintf(label, "SYRK %d %d", ii, jj); if (upper) { QUARK_Insert_Task(quark, SCHED_zsyrk, 0, sizeof(magma_int_t), &upper, VALUE, sizeof(magma_int_t), &temp2, VALUE, sizeof(magma_int_t), &nb, VALUE, sizeof(cuDoubleComplex)*(*n)*(*n), A(j,i), INPUT, sizeof(magma_int_t), lda, VALUE, sizeof(cuDoubleComplex)*(*n)*(*n), A(i,i), INOUT, strlen(label)+1, label, VALUE | TASKLABEL, 6, "green", VALUE | TASKCOLOR, 0); } else { QUARK_Insert_Task(quark, SCHED_zsyrk, 0, sizeof(magma_int_t), &upper, VALUE, sizeof(magma_int_t), &temp2, VALUE, sizeof(magma_int_t), &nb, VALUE, sizeof(cuDoubleComplex)*(*n)*(*n), A(i,j), INPUT, sizeof(magma_int_t), lda, VALUE, sizeof(cuDoubleComplex)*(*n)*(*n), A(i,i), INOUT, strlen(label)+1, label, VALUE | TASKLABEL, 6, "green", VALUE | TASKCOLOR, 0); } } } // if not last block if (i < ((*n)-nb)) { jj = -1; // split gemm into tiles for (j = i+nb; j < (*n); j += nb){ jj++; kk = -1; for (k = 0; k < i; k += nb) { kk++; temp = min(nb,(*n)-j); sprintf(label, "GEMM %d %d %d", ii, jj, kk); if (upper) { QUARK_Insert_Task(quark, SCHED_zgemm, 0, sizeof(magma_int_t), &upper, VALUE, sizeof(magma_int_t), &nb, VALUE, sizeof(magma_int_t), &temp, VALUE, sizeof(magma_int_t), &nb, VALUE, sizeof(cuDoubleComplex)*(*n)*(*n), A(k,i), INPUT, sizeof(magma_int_t), lda, VALUE, sizeof(cuDoubleComplex)*(*n)*(*n), A(k,j), INPUT, sizeof(cuDoubleComplex)*(*n)*(*n), A(i,j), INOUT, strlen(label)+1, label, VALUE | TASKLABEL, 5, "blue", VALUE | TASKCOLOR, 0); } else { QUARK_Insert_Task(quark, SCHED_zgemm, 0, sizeof(magma_int_t), &upper, VALUE, sizeof(magma_int_t), &temp, VALUE, sizeof(magma_int_t), &nb, VALUE, sizeof(magma_int_t), &nb, VALUE, sizeof(cuDoubleComplex)*(*n)*(*n), A(j,k), INPUT, sizeof(magma_int_t), lda, VALUE, sizeof(cuDoubleComplex)*(*n)*(*n), A(i,k), INPUT, sizeof(cuDoubleComplex)*(*n)*(*n), A(j,i), INOUT, strlen(label)+1, label, VALUE | TASKLABEL, 5, "blue", VALUE | TASKCOLOR, 0); } } } } } iinfo[0] = i; sprintf(label, "POTRF %d", ii); QUARK_Insert_Task(quark, SCHED_zpotrf, 0, sizeof(magma_int_t), &upper, VALUE, sizeof(magma_int_t), &temp2, VALUE, sizeof(cuDoubleComplex)*(*n)*(*n), A(i,i), INOUT, sizeof(magma_int_t), lda, VALUE, sizeof(magma_int_t), iinfo, OUTPUT, strlen(label)+1, label, VALUE | TASKLABEL, 5, "cyan", VALUE | TASKCOLOR, 0); // if not last block if (i < ((*n)-nb)) { // split trsm into tiles for (j = i + nb; j < (*n); j += nb) { temp = min(nb,(*n)-j); sprintf(label, "TRSM %d", ii); if (upper) { QUARK_Insert_Task(quark, SCHED_ztrsm, 0, sizeof(magma_int_t), &upper, VALUE, sizeof(magma_int_t), &nb, VALUE, sizeof(magma_int_t), &temp, VALUE, sizeof(cuDoubleComplex)*(*n)*(*n), A(i,i), INPUT, sizeof(magma_int_t), lda, VALUE, sizeof(cuDoubleComplex)*(*n)*(*n), A(i,j), INOUT, strlen(label)+1, label, VALUE | TASKLABEL, 4, "red", VALUE | TASKCOLOR, 0); } else { QUARK_Insert_Task(quark, SCHED_ztrsm, 0, sizeof(magma_int_t), &upper, VALUE, sizeof(magma_int_t), &temp, VALUE, sizeof(magma_int_t), &nb, VALUE, sizeof(cuDoubleComplex)*(*n)*(*n), A(i,i), INPUT, sizeof(magma_int_t), lda, VALUE, sizeof(cuDoubleComplex)*(*n)*(*n), A(j,i), INOUT, strlen(label)+1, label, VALUE | TASKLABEL, 4, "red", VALUE | TASKCOLOR, 0); } } } } QUARK_Barrier(quark); }
/* Try various ways to do matmul and time them. Tiled algorithms * running serially; multi-threaded QUARK runtime with tiled * algorithms; and direct serial computation over standard layout. */ int main_algorithm(int NB, int N, int THREADS) { int i, j, k, nerr=0; int BB = N/NB; double *A = (double*)malloc(N*N*sizeof(double)); double *Ablk = (double*)malloc(N*N*sizeof(double)); double *B = (double*)malloc(N*N*sizeof(double)); double *Bblk = (double*)malloc(N*N*sizeof(double)); double *C_direct = (double*)malloc(N*N*sizeof(double)); double *C = (double*)malloc(N*N*sizeof(double)); double *Cblk = (double*)malloc(N*N*sizeof(double)); double *C_quark = (double*)malloc(N*N*sizeof(double)); double *C_quark_blk = (double*)malloc(N*N*sizeof(double)); struct timeval tstart, tend, tdiff; double t_blk=0, t_quark=0, t_direct=0; // Initialize for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { A[i+j*N] = (double)1.0+i; B[i+j*N] = (double)2.0+i+j; C_quark[i+j*N] = C_direct[i+j*N] = C[i+j*N] = 3.0; } } matrix_print("Printing A", A, N); matrix_print("Printing B", B, N); matrix_print("Printing C before computation", C, N); // Move from F77 to BDL std_to_bdl( A, Ablk, N, NB ); std_to_bdl( B, Bblk, N, NB ); std_to_bdl( C, Cblk, N, NB ); std_to_bdl( C_quark, C_quark_blk, N, NB ); /* ORIGINAL TILED ROUTINE */ /* This is the code for the serial tile-by-tile multiplication */ printf("Doing matrix multiplication using serial tile-by-tile algorithm\n"); gettimeofday( &tstart, NULL ); for (i = 0; i < BB; i++) for (j = 0; j < BB; j++) for (k = 0; k < BB; k++) matmul ( &Ablk[NB*NB*i + NB*NB*BB*k], &Bblk[NB*NB*k + NB*NB*BB*j], &Cblk[NB*NB*i + NB*NB*BB*j], NB); gettimeofday( &tend, NULL ); t_blk = timeval_subtract( &tdiff, &tend, &tstart ); printf("Time taken: %f\n", tdiff.tv_sec + (double)tdiff.tv_usec/1000000 ); bdl_to_std( C, Cblk, N, NB ); matrix_print("Printing C produced by serial tile-algorithm after computation", C, N); printf("\n"); /* QUARK PARALLEL TILED ROUTINE */ /* This is the code for the QUARK runtime do do the parallel multi-threaded tile-by-tile algorithm */ printf("Doing matrix multiplication using the multi-threaded QUARK runtime for a tile based algorithm\n"); Quark *quark = QUARK_New(THREADS); gettimeofday( &tstart, NULL ); for (i = 0; i < BB; i++) for (j = 0; j < BB; j++) for (k = 0; k < BB; k++) matmul_quark_call ( quark, &Ablk[NB*NB*i + NB*NB*BB*k], &Bblk[NB*NB*k + NB*NB*BB*j], &C_quark_blk[NB*NB*i + NB*NB*BB*j], NB); QUARK_Barrier( quark ); gettimeofday( &tend, NULL ); t_quark = timeval_subtract( &tdiff, &tend, &tstart ); printf("Time taken: %f\n", tdiff.tv_sec + (double)tdiff.tv_usec/1000000 ); QUARK_Delete(quark); bdl_to_std( C_quark, C_quark_blk, N, NB ); matrix_print("Printing C produced by QUARK runtime after computation", C_quark, N); printf("\n"); /* DIRECT COMPUTATION OVER STANDARD LAYOUT */ /* Compute direct C if desired */ printf("Doing matrix multiplication using direct loops (ie, view matrix as one big tile)\n"); gettimeofday( &tstart, NULL ); matmul ( A, B, C_direct, N ); gettimeofday( &tend, NULL ); t_direct = timeval_subtract( &tdiff, &tend, &tstart ); printf("Time taken: %f\n", (double)(tdiff.tv_sec + (double)tdiff.tv_usec/1000000) ); matrix_print("Printing C produced by direct matmul after computation", C_direct, N); printf("\n"); /* Check for errors */ printf("Comparing result matrices (direct versus QUARK)\n"); nerr = matrix_compare( C_direct, C_quark, N ); printf("Number of differences: %d\n", nerr); printf("\n"); printf("Summary of time taken\n"); printf("Direct SerialBlock QUARK(%d threads)\n", THREADS); printf("%-12.5f %-12.5f %-12.5f\n", t_direct, t_blk, t_quark); free(A); free(Ablk); free(B); free(Bblk); free(C); free(Cblk); free(C_direct); free(C_quark); free(C_quark_blk); return 0; }