uint32_t ADI_FindContours( uint8_t *pImage, uint32_t nWidth, uint32_t nHeight ) { uint32_t i, pixels; uint32_t j, CONTOURCount; uint32_t nNumSegments; ADI_ITB_MODULE_STATUS nReturnValue; uint32_t index,nColour; ADI_CONTOUR_DYNAMIC_MEM_ALLOC *pDMA; ADI_CONT_SEGMENT_HDR *pSegmentListHdr; ADI_CONT_SEGMENT_HDR *pSegmentListTempHdr; ADI_CONT_SEGMENT_HDR_ROW_WISE *pSegmentListRowWiseHdr; //------------------------------------------------------------------------------------- // ADI BF60x Dice-Counting Demo Developed by Berkeley Design Technology, Inc. (BDTI) // ------------------------------------------------------------------------------------ uint32_t nArea, nBoxArea, nSymetry, nRatio; pDMA = &oDMA; pSegmentListHdr = NULL; pSegmentListTempHdr = NULL; pSegmentListRowWiseHdr = NULL; CONTOURCount = 0; nTotalRLENodes = 0; index =0; oCONTOURInfo. nNumberOfObjects =0; #ifdef PROFILE goProfileFunc.nSum = 0; PROFBEG(goProfileFunc); #endif if( InitMemory ) { nReturnValue = adi_DynamicMemInit(pDMA, aTempBuffer, TEMP_BUFFER_SIZE, nHeight*4); if ((nReturnValue & 0xFFFF) != (ADI_ITB_STATUS_SUCCESS)) { printf("adi_DynamicMemInit - Not enough memory\n"); //InitMemory = true; return (0); } pRunLenListHdr = (ADI_CONT_RUN_LEN_HDR *)pDMA->pMemAlloc(pDMA, ADI_CONTOUR_RUN_LEN_HDR, nHeight); if (pRunLenListHdr == NULL) { printf("pRunLenListHdr - Not enough memory\n"); //InitMemory = true; return (0); } pBoundingRectangle = (ADI_CONT_BOUNDING_RECTANGLE *)pDMA->pMemAlloc(pDMA, ADI_CONTOUR_BOUNDING_RECT, 1); if (pBoundingRectangle == NULL) { printf("pBoundingRectangle - Not enough memory\n"); //InitMemory = true; return (0); } InitMemory = false; } #ifdef PROFILE PROFEND(goProfileFunc); printf("adi_DynamicMemInit time(ms) = %d\n", goProfileFunc.nSum/500000); #endif #ifdef PROFILE goProfileFunc.nSum = 0; PROFBEG(goProfileFunc); #endif #ifdef PROFILE PROFEND(goProfileFunc); printf("Allocating memory time(ms) = %d\n", goProfileFunc.nSum/500000); #endif #ifdef PROFILE goProfileFunc.nSum = 0; PROFBEG(goProfileFunc); #endif #if 1 nReturnValue = ADIContoursRLEWrapper( pDMA, (int8_t *)pImage, nWidth*nHeight, pRunLenListHdr, nWidth, nHeight ); #else // L3 for (j = 0; j < nHeight; j++) { pRunLenListHdr[j].pNext = NULL; nReturnValue = adi_contour_RLE(pDMA, (uint8_t *)pImage + j * nWidth, nWidth, j, &pRunLenListHdr[j]); } #endif if ((nReturnValue & 0xFFFF) != (ADI_ITB_STATUS_SUCCESS)) { if ((nReturnValue & 0xFFFF) == (ADI_ITB_STATUS_CONT_NO_RUN_LEN)) { printf("Not enough memory\n"); printf("\tNot enough ADI_CONT_RUN_LEN_NODE nodes created.\nEdit MAX_NUM_RUN_LEN_NODE in adi_contour_mem_alloc.c to increase number of nodes created\n"); } else { printf("Error!!!\n"); } pDMA->pMemReset(pDMA, NULL, ADI_CONTOUR_RUN_LEN_NODE, 0); return (0); } #ifdef PROFILE PROFEND(goProfileFunc); printf("adi_contour_RLE time(ms) = %d\n", goProfileFunc.nSum/500000); #endif #ifdef PROFILE goProfileFunc.nSum = 0; PROFBEG(goProfileFunc); #endif nReturnValue = adi_contour_segment_image(pDMA, pRunLenListHdr, nHeight, &pSegmentListHdr, &nNumSegments); if ((nReturnValue & 0xFFFF) != (ADI_ITB_STATUS_SUCCESS)) { if ((nReturnValue & 0xFFFF) == (ADI_ITB_STATUS_CONT_NO_SEG_HDR)) { printf("Not enough memory\n"); printf("\tNot enough ADI_CONTOUR_SEG_HDR nodes created.\nEdit MAX_NUM_SEGMENT_HDR in adi_contour_mem_alloc.c to increase number of nodes created\n"); } else if ((nReturnValue & 0xFFFF) == (ADI_ITB_STATUS_CONT_NO_EQU_HDR)) { printf("Not enough memory\n"); printf("\tNot enough ADI_CONTOUR_EQU_SEG_HDR nodes created.\nEdit MAX_NUM_EQU_SEGMENT_HDR in adi_contour_mem_alloc.c to increase number of nodes created\n"); } else if ((nReturnValue & 0xFFFF) == (ADI_ITB_STATUS_CONT_NO_EQUNODE)) { printf("Not enough memory\n"); printf("\tNot enough ADI_CONTOUR_EQU_SEG_NODE nodes created.\nEdit MAX_NUM_EQU_SEGMENT_NODE in adi_contour_mem_alloc.c to increase number of nodes created\n"); } else { printf("Error!!!\n"); } pDMA->pMemReset(pDMA, NULL, ADI_CONTOUR_RUN_LEN_NODE, 0); pDMA->pMemReset(pDMA, NULL, ADI_CONTOUR_SEG_HDR, 0); pSegmentListHdr = NULL; pDMA->pMemReset(pDMA, NULL, ADI_CONTOUR_EQU_SEG_HDR, 0); return (0); } #ifdef PROFILE PROFEND(goProfileFunc); printf("adi_contour_segment_image time(ms) = %d\n", goProfileFunc.nSum/500000); #endif for(pSegmentListTempHdr = pSegmentListHdr ;pSegmentListTempHdr; ) { #ifdef PROFILE goProfileFunc.nSum = 0; PROFBEG(goProfileFunc); #endif adi_contour_BoundingRectangle(pSegmentListTempHdr, pBoundingRectangle); #ifdef PROFILE PROFEND(goProfileFunc); printf("Bounding Rectange time(ms) = %d\n", goProfileFunc.nSum/500000); #endif #ifdef PROFILE goProfileFunc.nSum = 0; PROFBEG(goProfileFunc); #endif pSegmentListRowWiseHdr = (ADI_CONT_SEGMENT_HDR_ROW_WISE *)pDMA->pMemAlloc(pDMA, ADI_CONTOUR_SEG_HDR_ROW_WISE, (pBoundingRectangle->nHeight)); if (pSegmentListRowWiseHdr == NULL) { printf("pSegmentListRowWiseHdr - Not enough memory\n"); pDMA->pMemFree(pDMA, pSegmentListRowWiseHdr, ADI_CONTOUR_SEG_HDR_ROW_WISE, (pBoundingRectangle->nHeight)); return (0XFFFFFFFF); } adi_contour_SplitSegmentRowWise(pSegmentListTempHdr, pSegmentListRowWiseHdr, pBoundingRectangle); #ifdef PROFILE PROFEND(goProfileFunc); printf("SplitSegmentRowWise+adi_ContourArea time(ms) = %d\n", goProfileFunc.nSum/500000); #endif //------------------------------------------------------------------------------------------------- // // Insert Contour classifier code here. // // Conceptually similar to boundingRect() in OpenCV // adi_contour_BoundingRectangle() returns the bounding rectangle of a contour. // The dimensions are in pixels. // pBoundingRectangle->nHeight = Height of the rectangle bounding the contour // pBoundingRectangle->nWidth = Width of the rectangle bounding the contour // // Conceptually similar to contourArea() in OpenCV // adi_ContourArea() returns the area of a contour. // The area returned is in pixels^2 // // oDotInfo is a structure used by the graphics module to display bounding boxes // see Graphics.c // nColour is the bonding box color in ARGB format (0x7FRRGGBB) //------------------------------------------------------------------------------------------------- //------------------------------------------------------------------------------------- // ADI BF60x Dice-Counting Demo Developed by Berkeley Design Technology, Inc. (BDTI) // ------------------------------------------------------------------------------------ nArea = adi_ContourArea( pSegmentListRowWiseHdr, pBoundingRectangle->nHeight ); nBoxArea = pBoundingRectangle->nHeight*pBoundingRectangle->nWidth; nSymetry = (10*pBoundingRectangle->nHeight)/pBoundingRectangle->nWidth; //--------------------------------------------------------------------------------------------- // // BDTI Full Dice Dot Classifier Cascade - 02/18/2013 // // 1st level - Filter out circles that are very big or very small // 2nd level - Ratio of calculated area of inscribed circle to measured contour area // Calculated area of inscribed circle = (((contour bounding box area)*PI)/4) // Measured area of arbitrary contour. // Ratio = ((nBoxArea*314)/4)/nArea = 100*((Box Area *PI)/(4*Circle area)) // 3rd level - Symmetry of contour (bounding box height / bounding box width). // (10*pBoundingRectangle->nHeight)/pBoundingRectangle->nWidth; // // If the contour passes through all three levels, it is classified as a Dice Dot and counted. // //--------------------------------------------------------------------------------------------- nColour = 0x7F000000; // Black - Used for Demo Mode 1 if( (nArea >= CIRCLEAREAMIN) && (nArea <= CIRCLEAREAMAX ) ) { nRatio = (nBoxArea*314)/4; nRatio /= nArea; // nRatio = ((Box Area * 314)/4)/nArea = 100*((Box Area *PI)/(4*Circle area)) nColour = 0x7FFF0000; // RED - Used for Demo Mode 1 //adi_FillArea( pImage, nWidth, pSegmentListRowWiseHdr, pBoundingRectangle->nHeight, nBoxArea/10 ); //adi_FillArea( pImage, nWidth, pSegmentListRowWiseHdr, pBoundingRectangle->nHeight, nArea/10 ); //adi_FillArea( pImage, nWidth, pSegmentListRowWiseHdr, pBoundingRectangle->nHeight, nRatio ); if( (nRatio >= MIN_RATIO) && (nRatio <= MAX_RATIO) ) { nColour = 0x7F0000FF; // Blue - Used for Demo Mode 1 //adi_FillArea( pImage, nWidth, pSegmentListRowWiseHdr, pBoundingRectangle->nHeight, nSymetry ); //nSymetry ); if( (nSymetry >= MIN_SYMETRY) && (nSymetry <= MAX_SYMETRY) ) { adi_FillArea( pImage, nWidth, pSegmentListRowWiseHdr, pBoundingRectangle->nHeight, 3 ); nColour = 0X7F00FF00; // Green - Used for Demo Mode 1 CONTOURCount++; } } } //--------------------------------------------------------------------------------------------- // Limit viewable bounding rectangles to 1.5 times MAX box area if( (nBoxArea <= ((BOXAREAMAX*3)/2) ) ) { nColour = 0x7F000000 + (nArea&0x00FFFFFF); if(index < MAX_OBJ_INFO) { oCONTOURInfo.aCONTOURInfo[index].nHeight = pBoundingRectangle->nHeight; oCONTOURInfo.aCONTOURInfo[index].nWidth = pBoundingRectangle->nWidth; oCONTOURInfo.aCONTOURInfo[index].nXBottomRight = pBoundingRectangle->nXBottomRight; oCONTOURInfo.aCONTOURInfo[index].nXTopLeft = pBoundingRectangle->nXTopLeft; oCONTOURInfo.aCONTOURInfo[index].nYBottomRight = pBoundingRectangle->nYBottomRight; oCONTOURInfo.aCONTOURInfo[index].nYTopLeft = pBoundingRectangle->nYTopLeft; oCONTOURInfo.nColour[index++] = nColour; } } #ifdef PROFILE goProfileFunc.nSum = 0; PROFBEG(goProfileFunc); #endif pDMA->pMemFree(pDMA, pSegmentListRowWiseHdr, ADI_CONTOUR_SEG_HDR_ROW_WISE, (pBoundingRectangle->nHeight)); pSegmentListRowWiseHdr = NULL; #ifdef PROFILE PROFEND(goProfileFunc); printf("inside for memfree memreset time(ms) = %d\n", goProfileFunc.nSum/500000); #endif pSegmentListTempHdr = pSegmentListTempHdr->pDown; } // while //---------------------------------------------------------------------------------------- #ifdef PROFILE goProfileFunc.nSum = 0; PROFBEG(goProfileFunc); #endif pDMA->pMemReset(pDMA, pSegmentListHdr, ADI_CONTOUR_SEG_HDR, 0); pDMA->pMemReset(pDMA, pRunLenListHdr, ADI_CONTOUR_RUN_LEN_NODE, nHeight); pSegmentListHdr = NULL; pDMA->pMemReset(pDMA, NULL, ADI_CONTOUR_EQU_SEG_HDR, 0); #ifdef PROFILE PROFEND(goProfileFunc); printf("Memory Cleanup time(ms) = %d\n", goProfileFunc.nSum/500000); #endif oCONTOURInfo. nNumberOfObjects = index; oCONTOURInfo.nCONTOURCount = CONTOURCount; return (CONTOURCount); }
void psgetrf_gpu(int *m_in, int *n_in, float *A, int *ia_in, int *ja_in, int *descA, int *ipiv_, int *info) { int m = *m_in; int n = *n_in; int ia = *ia_in; int ja = *ja_in; const int use_setup_desc = TRUE; const int idebug = 0; int use_replicated_storage = FALSE; const int use_broadcast_triangular_matrix = TRUE; int ia_proc, ja_proc; int lrindx, lcindx, rsrc,csrc, irsrc,icsrc; int ictxt, nprow,npcol, myprow,mypcol; int is_root; int minmn; int k1,k2,incx,ip; int mm, nn, kk, ii, jj, mtmp; int mm_lu,nn_lu,ia_lu,ja_lu; int elemSize = sizeof( float ); size_t nbytes; int nnb, jstart,jend,jsize, isize, jb; int icontxt, isizeAtmp; int i,j, iia,jja, ldA, ldhA; int iinfo = 0; int iAtmp, jAtmp, iha,jha, iib,jjb,iic,jjc; int ldAtmp, ldBtmp, lmm,lnn; int lrA1,lcA1, lrA2,lcA2; int desc_hA_[DLEN_]; int *desc_hA = &(desc_hA_[0]); int *ipiv_hA_ = 0; float *hA = 0; float *Atmp = 0; float *dAtmp = 0; int *gipiv_ = 0; int desc_Atmp_[DLEN_]; int *desc_Atmp = &(desc_Atmp_[0]); cublasStatus cu_status; int isok; int use_delayed_left_interchange = 1; int is_mine; int i1,j1,inc1, i2,j2,inc2; int desc_ipiv_hA_[DLEN_]; int *desc_ipiv_hA = &(desc_ipiv_hA_[0]); int desc_ipiv_[DLEN_]; int *desc_ipiv = &(desc_ipiv_[0]); int desc_gipiv_[DLEN_]; int *desc_gipiv = &(desc_gipiv_[0]); int mb,nb, Locp, Locq, lld; char direc = 'F'; char rowcol = 'R'; char left[] = "Left"; char lower[] = "Lower"; char notrans[] = "NoTrans"; char unit[] = "Unit"; char *side = left; char *uplo = lower; char *trans = notrans; char *diag = unit; float zero_[REAL_PART+IMAG_PART+1]; float *zero = &(zero_[0]); float one_[REAL_PART+IMAG_PART+1]; float *one = &(one_[0]); float neg_one_[REAL_PART+IMAG_PART+1]; float *neg_one = &(neg_one_[0]); float beta_[REAL_PART+IMAG_PART+1]; float *beta = &(beta_[0]); float alpha_[REAL_PART+IMAG_PART+1]; float *alpha = &(alpha_[0]); /* * A is a pointer to GPU device memory but conceptually associated * with a scalapack distributed matrix * A is array of complex numbers */ *info = 0; zero[REAL_PART] = 0.0; zero[IMAG_PART] = 0.0; one[REAL_PART] = 1.0; one[IMAG_PART] = 0.0; neg_one[REAL_PART] = -1.0; neg_one[IMAG_PART] = 0.0; /* * setup copy of distributed matrix on CPU host */ hA = 0; Atmp = 0; ictxt = descA[CTXT_]; icontxt = ictxt; Cblacs_gridinfo( ictxt, &nprow, &npcol, &myprow, &mypcol ); is_root = (myprow == 0) && (mypcol == 0); if ((idebug >= 1) && (is_root)) { printf("pcgetrf_gpu: m %d n %d ia %d ja %d \n", m,n, ia,ja ); }; ia_proc = Cindxg2p( ia, descA[MB_], myprow, descA[RSRC_], nprow); ja_proc = Cindxg2p( ja, descA[NB_], mypcol, descA[CSRC_], npcol); /* * setup global pivot vector */ lld = MIN(m,n) + descA[MB_]; nbytes = lld; nbytes *= sizeof(int); if (gipiv_ != 0) { free(gipiv_); gipiv_ = 0; }; gipiv_ = (int *) malloc( nbytes ); assert( gipiv_ != 0 ); desc_gipiv[DTYPE_] = descA[DTYPE_]; desc_gipiv[CTXT_] = descA[CTXT_]; desc_gipiv[M_] = MIN(m,n); desc_gipiv[N_] = 1; desc_gipiv[MB_] = desc_gipiv[M_]; desc_gipiv[NB_] = desc_gipiv[N_]; desc_gipiv[LLD_] = lld; desc_gipiv[RSRC_] = -1; desc_gipiv[CSRC_] = -1; /* * setup distribute array hA on host */ /* * Note, optimal block size on GPU might not be * optimal block size on CPU, but assume to be * the same for simplicity for now */ /* * should nnb = descA[NB_] * npcol ? */ nnb = descA[NB_]; minmn = MIN(m,n); for( jstart=1; jstart <= minmn; jstart = jend + 1) { jend = MIN( minmn, jstart + nnb - 1); jsize = jend - jstart + 1; /* * setup matrix on host */ /* was iia = (ia-1) + 1; */ j = jstart; jb = jsize; iia = (ia-1) + jstart; jja = (ja-1) + jstart; mm = m - jstart + 1; nn = jsize; if (use_setup_desc) { setup_desc( mm,nn, iia,jja,descA, &isize, desc_hA ); } else { irsrc = Cindxg2p( iia, descA[MB_], myprow, descA[RSRC_], nprow ); icsrc = Cindxg2p( jja, descA[NB_], mypcol, descA[CSRC_], npcol ); mb = descA[MB_]; nb = descA[NB_]; Locp = Cnumroc( mm, mb, 0,0,nprow ); Locq = Cnumroc( nn, nb, 0,0,npcol ); lld = MAX(1,Locp); isize = MAX(1,Locp) * MAX(1, Locq ); ictxt = descA[CTXT_]; iinfo = 0; Cdescinit( desc_hA, mm,nn, mb,nb, irsrc,icsrc, ictxt, lld, &iinfo); assert( iinfo == 0); }; nbytes = isize; nbytes *= elemSize; if (hA != 0) { free(hA); hA = 0; }; hA = (float *) malloc( nbytes ); assert( hA != 0 ); /* * distribution of pivot vector is tied to distribution of matrix */ Locp = Cnumroc( desc_hA[M_], desc_hA[MB_], myprow, desc_hA[RSRC_], nprow); lld = Locp + desc_hA[MB_]; nbytes = lld; nbytes *= sizeof(int); if (ipiv_hA_ != 0) { free( ipiv_hA_ ); ipiv_hA_ = 0; }; ipiv_hA_ = (int *) malloc( nbytes ); assert( ipiv_hA_ != 0); Cdescset( desc_ipiv_hA, desc_hA[M_], 1, desc_hA[MB_], 1, desc_hA[RSRC_], icsrc, desc_hA[CTXT_], lld ); /* copy column panel back to CPU host to be factored using scalapack */ jb = jsize; j = jstart; mm = m - j + 1; nn = jb; /* hA(1:mm,1:nn) <- dA(j:(j+mm-1), j:(j+nn-1) ) */ iia = (ia-1) + j; jja = (ja-1) + j; ii = 1; jj = 1; PROFSTART("gpu:hA <- dA"); Cpsgecopy_d2h( mm,nn, A,iia,jja,descA, hA, ii,jj, desc_hA ); PROFEND("gpu:hA <- dA"); /* * factor on host CPU using ScaLAPACK * Note the pivot vector is tied to the distribution of the matrix * Therefore, we need a different "ipiv_hA" pivot vector * that is tied the the distributed matrix hA */ ii = 1; jj = 1; iinfo = 0; mm_lu = mm; nn_lu = nn; ia_lu = ii; ja_lu = jj; PROFSTART("gpu:psgetrf"); scalapack_psgetrf( &mm_lu, &nn_lu, hA, &ia_lu, &ja_lu, desc_hA, &(ipiv_hA(1)), &iinfo ); PROFEND("gpu:psgetrf"); /* * broadcast pivot vector to global vector */ i1 = 1; j1 = 1; inc1 = 1; i2 = jstart; j2 = 1; inc2 = 1; mtmp = MIN(mm,nn); desc_ipiv_hA[CSRC_] = icsrc; use_replicated_storage = FALSE; if (use_replicated_storage) { int ja_lu_proc; ja_lu_proc = Cindxg2p(ja_lu,desc_hA[NB_], mypcol,desc_hA[CSRC_],npcol); desc_ipiv_hA[CSRC_] = ja_lu_proc; desc_gipiv[RSRC_] = -1; desc_gipiv[CSRC_] = -1; scalapack_picopy( &mtmp, &(ipiv_hA(1)), &i1,&j1, desc_ipiv_hA, &inc1, &(gipiv(1)), &i2,&j2, desc_gipiv, &inc2 ); } else { /* * copy to 1 processors (rsrc,csrc), then * broadcast to all processors */ int icontxt = desc_ipiv_hA[CTXT_]; char scope = 'A'; char top = ' '; int ntmp = 1; int lld; int ia_lu_proc,ja_lu_proc; int rsrc, csrc; ia_lu_proc = Cindxg2p( ia_lu, desc_hA[MB_], myprow,desc_hA[RSRC_],nprow); ja_lu_proc = Cindxg2p( ja_lu, desc_hA[NB_], mypcol,desc_hA[CSRC_],npcol); rsrc = ia_lu_proc; csrc = ja_lu_proc; desc_gipiv[RSRC_] = rsrc; desc_gipiv[CSRC_] = csrc; desc_ipiv_hA[CSRC_] = csrc; mtmp = MIN( mm_lu, nn_lu); scalapack_picopy( &mtmp, &(ipiv_hA(1)), &i1,&j1,desc_ipiv_hA,&inc1, &(gipiv(1)), &i2,&j2, desc_gipiv, &inc2 ); if ((myprow == rsrc) && (mypcol == csrc)) { lld = mtmp; ntmp = 1; scalapack_igebs2d( &icontxt, &scope, &top, &mtmp, &ntmp, &(gipiv(i2)), &lld ); } else { lld = mtmp; ntmp = 1; scalapack_igebr2d( &icontxt, &scope, &top, &mtmp, &ntmp, &(gipiv(i2)), &lld, &rsrc,&csrc ); }; }; if (idebug >= 1) { int desctmp[DLEN_]; char name_ipiv_hA[] = "ipiv_hA"; char name_gipiv[] = "gipiv"; if (is_root) { printf("jstart %d jend %d \n", jstart,jend); printf("mm_lu %d nn_lu %d ia_lu %d ja_lu %d\n", mm_lu, nn_lu, ia_lu, ja_lu ); }; Cdescset(desctmp, desc_hA[M_], npcol, desc_hA[MB_],1, desc_hA[RSRC_], desc_hA[CSRC_], desc_hA[CTXT_], desc_hA[LLD_] ); Cpilaprnt( MIN(mm_lu,nn_lu), npcol, &(ipiv_hA(1)), 1,1,desctmp, name_ipiv_hA); Cdescset(desctmp, minmn*nprow, npcol, minmn, 1, 0,0, descA[CTXT_], minmn ); Cpilaprnt( nprow*minmn, npcol, &(gipiv(1)),1,1,desctmp, name_gipiv); }; /* * adjust pivot sequence from 1:min(mm,nn) in ipiv to * jstart:(jstart+min(mm,nn)-1) */ for(int i=1; i <= MIN(mm,nn); i++) { i2 = (jstart-1) + i; gipiv(i2) = gipiv(i2) + (jstart-1); }; if (iinfo < 0) { *info = iinfo; return; }; if ((*info == 0) && (iinfo > 0)) { *info = iinfo + (j-1); return; }; /* * transfer factored panel back to GPU device */ iia = (ia-1) + j; jja = (ja-1) + j; ii = 1; jj = 1; PROFSTART("gpu:A <- hA"); Cpsgecopy_h2d(mm,nn, hA, ii,jj, desc_hA, A, iia,jja, descA ); PROFEND("gpu:A <- hA"); if (use_delayed_left_interchange) { /* * do nothing for now */ } else { /* * apply interchanges to columns 1:(j-1) */ nn = j-1; k1 = j; k2 = j + jb-1; incx = 1; PROFSTART("gpu:left swap"); if (nn >= 1) { iia = (ia-1) + 1; jja = (ja-1) + 1; for(kk=k1; kk <= k2; kk++) { ip = gipiv( kk); assert(ip >= kk ); assert( ip <= m ); if (kk != ip) { inc1 = descA[M_]; inc2 = descA[M_]; i1 = (iia-1) + kk; i2 = (iia-1) + ip; j1 = jja; j2 = jja; Cpsswap_gpu(nn, A,i1,j1,descA,inc1, A,i2,j2,descA,inc2 ); }; }; }; PROFEND("gpu:left swap"); }; /* * apply interchanges to columns (j+jb):n */ nn = n - (jend + 1) + 1; k1 = j; k2 = j + jb - 1; incx = 1; PROFSTART("gpu:right swap"); if (nn >= 1) { iia = (ia-1) + 1; jja = (ja-1) + (jend+1); for(kk=k1; kk <= k2; kk++) { ip = gipiv( kk ); assert( ip >= kk ); assert( ip <= m ); if (ip != kk) { i1 = (iia-1) + kk; i2 = (iia-1) + ip; j1 = jja; j2 = jja; inc1 = descA[M_]; inc2 = descA[M_]; Cpsswap_gpu( nn, A, i1,j1, descA, inc1, A, i2,j2, descA, inc2 ); }; }; }; PROFEND("gpu:right swap"); PROFSTART("gpu:pTRSM"); mm = jb; nn = n - (jend+1) + 1; if ( (1 <= mm) && (1 <= nn)) { /* cublasCtrsm('L','L','N','U', mm,nn, alpha, dA(j,j), lddA, dA(j,j+jb), lddA ); */ if (use_broadcast_triangular_matrix) { /* * broadcast triangular part, then solve locally */ char lscope = 'A'; char ltop = ' '; int msize, nsize, lr1,lc1, lr2,lc2; int ia_lu_proc, ja_lu_proc; /* * copy on local processor */ ia_lu_proc = Cindxg2p(ia_lu, desc_hA[MB_], myprow, desc_hA[RSRC_], nprow ); ja_lu_proc = Cindxg2p(ja_lu, desc_hA[NB_], mypcol, desc_hA[CSRC_], npcol ); /* * complete mm by mm block on Atmp */ ldAtmp = MAX(1,mm); Cdescset(desc_Atmp, mm,mm, mm,mm, ia_lu_proc,ja_lu_proc, icontxt, ldAtmp); isizeAtmp = ldAtmp * MAX(1,mm); nbytes = isizeAtmp; nbytes *= elemSize; if (Atmp != 0) { free(Atmp); Atmp = 0; }; Atmp = (float *) malloc( nbytes ); assert( Atmp != 0); #ifdef USE_CUBLASV2 { cudaError_t ierr; size_t isize = isizeAtmp; isize *= elemSize; ierr = cudaMalloc( (void **) &dAtmp, isize ); assert(ierr == cudaSuccess ); } #else cu_status = cublasAlloc(isizeAtmp, elemSize, (void **) &dAtmp ); CHKERR(cu_status); assert( dAtmp != 0); #endif ii = 1; jj = 1; scalapack_psgeadd( notrans, &mm, &mm, one, hA, &ia_lu, &ja_lu, desc_hA, zero, Atmp, &ii, &jj, desc_Atmp ); rsrc = desc_Atmp[RSRC_]; csrc = desc_Atmp[CSRC_]; if ((myprow == rsrc) && (mypcol == csrc)) { scalapack_cgebs2d( &icontxt, &lscope, <op, &mm, &mm, Atmp, &ldAtmp ); } else { scalapack_cgebr2d( &icontxt, &lscope, <op, &mm, &mm, Atmp, &ldAtmp, &rsrc, &csrc ); }; inc1 = 1; inc2 = 1; cu_status = cublasSetVector(isizeAtmp, elemSize, Atmp, inc1, dAtmp, inc2 ); CHKERR(cu_status); /* * perform local solve on GPU */ iia = (ia-1) + j; jja = (ja-1) + (j+jb); local_extent( mm,nn, iia,jja,descA, &msize,&nsize, &lr1,&lc1, &lr2,&lc2 ); if (msize >= 1) { assert( msize == mm ); }; if ((msize >= 1) && (nsize >= 1)) { char lside = 'L'; char luplo = 'L'; char ltrans = 'N'; char ldiag = 'U'; float zalpha; zalpha = (float)1.0;//make_float(1.0,0.0); CUBLAS_STRSM( ((lside == 'l')||(lside == 'L')) ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT, ((luplo == 'l')||(luplo == 'L')) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER, ((ltrans == 'c')||(ltrans == 'C')) ? CUBLAS_OP_C : ((ltrans == 't')||(ltrans == 'T')) ? CUBLAS_OP_T : CUBLAS_OP_N, ((ldiag == 'u')||(ldiag == 'U')) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT, mm, nsize, zalpha, (float *) dAtmp, ldAtmp, dA(lr1,lc1), descA[LLD_] ); }; if (Atmp != 0) { free(Atmp); Atmp = 0; }; #ifdef USE_CUBLASV2 { cudaError_t ierr; ierr = cudaFree( (void *) dAtmp ); assert(ierr == cudaSuccess ); dAtmp = 0; } #else cu_status = cublasFree( dAtmp ); CHKERR(cu_status ); #endif } else { /* * perform triangular solve using scalapack */ iia = (ia-1) + j; jja = (ja-1) + (j+jb); setup_desc(mm,nn,iia,jja,descA, &isize, desc_Atmp ); nbytes = elemSize; nbytes *= isize; if (Atmp != 0) { free(Atmp); Atmp = 0; }; Atmp = (float *) malloc( nbytes ); assert( Atmp != 0 ); /* * copy to Atmp(1:mm,1:nn) <- dA(j:(j+mm-1),(j+jb):((j+jb)+nn-1)) */ ii = 1; jj = 1; PROFSTART("gpu:Atmp <- dA"); Cpsgecopy_d2h( mm,nn,A,iia,jja,descA, Atmp, ii,jj, desc_Atmp ); PROFEND("gpu:Atmp <- dA"); /* * perform triangular solve using scalapack */ side = left; uplo = lower; trans = notrans; diag = unit; alpha = one; iha = 1; jha = 1; ii = 1; jj = 1; PROFSTART("gpu:pstrsm") scalapack_pstrsm( side, uplo, trans, diag, &mm,&nn, alpha, hA, &iha,&jha, desc_hA, Atmp,&ii,&jj, desc_Atmp ); PROFEND("gpu:pstrsm") /* * copy back to GPU */ iia = (ia-1) + j; jja = (ja-1) + (j+jb); ii = 1; jj = 1; PROFSTART("gpu:A <- Atmp"); Cpsgecopy_h2d( mm,nn, Atmp,ii,jj,desc_Atmp, A, iia,jja, descA ); PROFEND("gpu:A <- Atmp"); }; }; PROFEND("gpu:pTRSM"); /* * update trailing submatrix */ alpha = neg_one; beta = one; mm = m-(jend+1) + 1; nn = n-(jend+1) + 1; kk = jb; if ((1 <= mm) && (1 <= nn) && (1 <= kk)) { /* cublasSgemm('N','N',mm,nn,kk, alpha, dA(j+jb,j),lddA, dA(j,j+jb),lddA, beta, dA(j+jb,j+jb), lddA ); */ if (use_broadcast_triangular_matrix) { /* * Copy from GPU to Atmp */ iia = (ia-1) + j; jja = (ja-1) + (j+jb); setup_desc( kk,nn, iia,jja, descA, &isizeAtmp, desc_Atmp); nbytes = isizeAtmp; nbytes *= elemSize; if (Atmp != 0) { free(Atmp); Atmp = 0; }; Atmp = (float *) malloc( nbytes ); assert( Atmp != 0); PROFSTART("gpu:Atmp <- A"); Cpsgecopy_d2h( kk,nn, A,iia,jja,descA, Atmp,1,1,desc_Atmp ); PROFEND("gpu:Atmp <- A"); }; iic = (ia-1) + (jend+1); jjc = (ja-1) + (jend+1); iha = jsize+1; jha = 1; iAtmp = 1; jAtmp = 1; { char transA = 'N'; char transB = 'N'; PROFSTART("zgetrf_gpu:psgemm"); Cpsgemm_hhd( transA, transB, mm,nn,kk, alpha, hA, iha,jha, desc_hA, Atmp, iAtmp,jAtmp, desc_Atmp, beta, A, iic,jjc, descA ); PROFEND("zgetrf_gpu:psgemm"); }; }; if (Atmp != 0) { free(Atmp); Atmp = 0; }; if (ipiv_hA_ != 0) { free( ipiv_hA_ ); ipiv_hA_ = 0; }; if (hA != 0) { free(hA); hA = 0; }; }; /* for (jstart) */ if (use_delayed_left_interchange) { PROFSTART("gpu:dleft swap"); for(j=1; j <= minmn; j = jend + 1) { jend = MIN( minmn, j+nnb-1); jsize = jend - j + 1; jb = jsize; /* * apply interchanges to columns 1:(j-1) */ nn = j-1; k1 = j; k2 = j+jb-1; incx = 1; if (nn >= 1) { iia = (ia-1) + 1; jja = (ja-1) + 1; for(kk=k1; kk <= k2; kk++) { ip = gipiv(kk); assert( ip >= kk ); if (ip != kk) { inc1 = descA[M_]; inc2 = descA[M_]; i1 = (iia-1) + kk; i2 = (iia-1) + ip; j1 = jja; j2 = jja; Cpsswap_gpu(nn, A, i1,j1,descA, inc1, A, i2,j2,descA, inc2 ); }; }; }; }; /* end for j */ PROFEND("gpu:dleft swap"); }; /* end if use delayed left interchange */ /* * adjust global pivot from 1:MIN(m,n) to ia:(ia + MIN(m,n)-1) * copy global vector back to distributed pivot vector */ for(int j=1; j <= minmn; j++) { gipiv(j) = (ia-1) + gipiv(j); }; lld = descA[MB_] + Cnumroc( descA[M_], descA[MB_], myprow, descA[RSRC_], nprow); Cdescset( desc_ipiv, descA[M_],1, descA[MB_], 1, descA[RSRC_], -1, descA[CTXT_], lld ); i1 = 1; j1 = 1; inc1 = 1; i2 = ia; j2 = 1; inc2 = 1; mtmp = MIN(m,n); PROFSTART("gpu:ipiv"); use_replicated_storage = FALSE; if (use_replicated_storage) { int msize,nsize,lr1,lc1,lr2,lc2, lrindx,iia; local_extent(MIN(m,n),n,ia,ja,descA, &msize,&nsize, &lr1,&lc1, &lr2,&lc2); if (msize >= 1) { for(lrindx=lr1; lrindx <= lr2; lrindx++) { iia = Cindxl2g( lrindx, descA[MB_], myprow, descA[RSRC_], nprow); ipiv(lrindx) = gipiv( (iia-ia) + 1 ); }; }; } else { /* * copy to a column, then broadcast */ char scope = 'R'; char top = ' '; int Locp, Locq; int lld; int icontxt = desc_ipiv[CTXT_]; desc_ipiv[CSRC_] = ja_proc; desc_gipiv[RSRC_] = ia_proc; desc_gipiv[CSRC_] = ja_proc; mtmp = MIN(m,n); scalapack_picopy( &mtmp, &(gipiv(1)), &i1,&j1, desc_gipiv, &inc1, &(ipiv(1)), &i2, &j2, desc_ipiv, &inc2 ); if (idebug >= 1) { char cmatnm[] = "ipiv after picopy"; if (is_root) { printf("ia_proc %d ja_proc %d i2 %d j2 %d \n",ia_proc,ja_proc,i2,j2); }; Cpilaprnt( mtmp,1, &(ipiv(1)), i2,j2,desc_ipiv, cmatnm); }; Locp = Cnumroc( ia + MIN(m,n)-1, desc_ipiv[MB_], myprow, desc_ipiv[RSRC_], nprow); lld = MAX(1,Locp); Locq = 1; if (npcol > 1) { if (mypcol == ja_proc) { scalapack_igebs2d( &icontxt, &scope, &top, &Locp, &Locq, &(ipiv(1)), &lld ); } else { rsrc = myprow; scalapack_igebr2d( &icontxt, &scope, &top, &Locp, &Locq, &(ipiv(1)), &lld, &rsrc, &ja_proc ); }; }; }; PROFEND("gpu:ipiv"); if (idebug >= 1) { int desctmp[DLEN_]; char cmatnm[] = "final ipiv"; Cdescset( desctmp, descA[M_],npcol, descA[MB_],1, descA[RSRC_], descA[CSRC_], descA[CTXT_], descA[LLD_]); Cpilaprnt( MIN(m,n),npcol, &(ipiv(1)), ia,1,desctmp, cmatnm); }; /* * clean up */ if (Atmp != 0) { free(Atmp); Atmp = 0; }; if (hA != 0) { free(hA); hA = 0; }; if (ipiv_hA_ != 0) { free( ipiv_hA_ ); ipiv_hA_ = 0; }; if (gipiv_ != 0) { free(gipiv_); gipiv_ = 0; }; return; }
void Cpcswap_gpu( int n, cuComplex *A, int ia,int ja,int *descA, int incA, cuComplex *B, int ib,int jb,int *descB, int incB ) { /* perform pcswap operation when both distributed arrays A and B are in device memory */ /* * allocate temporary space on host * then use pcswap for communication */ const int use_MallocHost = FALSE; cublasStatus cu_status; size_t nbytes; int elemSize = sizeof( cuComplex ); float *Atmp = 0; float *Btmp = 0; int descAtmp[DLEN_]; int descBtmp[DLEN_]; int ldA, ldB, ldAtmp, ldBtmp; int nprow,npcol,myprow,mypcol; int Locp, Locq, lrindx, lcindx, mm,nn; int LocpA, LocqA, lrindxA, lcindxA; int LocpB, LocqB, lrindxB, lcindxB; int isizeA, isizeB, rsrc, csrc; int iia,jja, iib, jjb; int incAtmp, incBtmp; int lrA1,lcA1, lrA2,lcA2; int lrB1,lcB1, lrB2,lcB2; Cblacs_gridinfo( descA[CTXT_], &nprow, &npcol, &myprow, &mypcol ); /* * allocate storage for vector from A */ if (incA == 1) { /* * This is a column vector */ mm = n; nn = 1; } else { /* * This is a row vector */ mm = 1; nn = n; }; setup_desc( mm,nn, ia,ja, descA, &isizeA, descAtmp ); nbytes = elemSize; nbytes *= isizeA; if (use_MallocHost) { Atmp = (float *) MallocHost( nbytes ); } else { Atmp = (float *) malloc( nbytes ); }; assert( Atmp != 0 ); /* * copy vector from A */ PROFSTART("swap:GetMatrix"); local_extent( mm,nn,ia,ja,descA, &LocpA, &LocqA, &lrA1,&lcA1, &lrA2,&lcA2 ); lrindxA = lrA1; lcindxA = lcA1; ldA = descA[LLD_]; ldAtmp = descAtmp[LLD_]; if ( (LocpA >= 1) && (LocqA >= 1)) { /* * copy from GPU device to host CPU */ cu_status = cublasGetMatrix( LocpA,LocqA, elemSize, dA(lrindxA,lcindxA), ldA, Atmp, ldAtmp ); CHKERR(cu_status); }; /* * allocate storage for vector from B */ Cblacs_gridinfo( descB[CTXT_], &nprow, &npcol, &myprow, &mypcol ); if (incB == 1) { /* * This is a column vector */ mm = n; nn = 1; } else { /* * This is a row vector */ mm = 1; nn = n; }; setup_desc( mm,nn, ib,jb,descB, &isizeB, descBtmp ); ldBtmp = descBtmp[LLD_]; ldB = descB[LLD_]; nbytes = elemSize; nbytes *= isizeB; if (use_MallocHost) { Btmp = (float *) MallocHost( nbytes ); } else { Btmp = (float *) malloc( nbytes ); }; assert( Btmp != 0 ); /* * copy vector from B */ local_extent( mm,nn,ib,jb,descB, &LocpB, &LocqB, &lrB1,&lcB1, &lrB2,&lcB2 ); lrindxB = lrB1; lcindxB = lcB1; ldB = descB[LLD_]; ldBtmp = descBtmp[LLD_]; if ((LocpB >= 1) && (LocqB >= 1)) { /* * Copy from GPU to CPU host */ cu_status = cublasGetMatrix(LocpB,LocqB,elemSize, dB(lrindxB,lcindxB), ldB, Btmp, ldBtmp ); CHKERR(cu_status ); }; PROFEND("swap:GetMatrix"); iia = 1; jja = 1; iib = 1; jjb = 1; if (incA == 1) { incAtmp = 1; } else { incAtmp = descAtmp[M_]; }; if (incB == 1) { incBtmp = 1; } else { incBtmp = descBtmp[M_]; }; PROFSTART("swap:pcswap"); scalapack_pcswap( &n, Atmp, &iia, &jja, descAtmp, &incAtmp, Btmp, &iib, &jjb, descBtmp, &incBtmp ); PROFEND("swap:pcswap"); /* * copy from host CPU back to GPU */ PROFSTART("swap:SetMatrix"); if ((LocpA >= 1) && (LocqA >= 1)) { /* * Copy from CPU host to GPU device */ cu_status = cublasSetMatrix( LocpA, LocqA, elemSize, Atmp, ldAtmp, dA(lrindxA,lcindxA), ldA ); CHKERR(cu_status); }; if ((LocpB >= 1) && (LocqB >= 1)) { /* * Copy from CPU host to GPU device */ cu_status = cublasSetMatrix( LocpB, LocqB, elemSize, Btmp, ldBtmp, dB(lrindxB,lcindxB), ldB ); CHKERR(cu_status); }; PROFEND("swap:SetMatrix"); /* * clean up */ if (Atmp != 0) { if (use_MallocHost) { FreeHost(Atmp); } else { free(Atmp); }; Atmp = 0; }; if (Btmp != 0) { if (use_MallocHost) { FreeHost(Btmp); } else { free(Btmp); }; Btmp = 0; }; return; }
static void tintin(void) { int i, result, maxfd; struct timeval tv; fd_set readfdmask; #ifdef XTERM_TITLE struct session *lastsession=0; #endif char kbdbuf[BUFFER_SIZE]; WC ch; int inbuf=0; mbstate_t instate; memset(&instate, 0, sizeof(instate)); for (;;) { #ifdef XTERM_TITLE if (ui_own_output && activesession!=lastsession) { lastsession=activesession; if (activesession==nullsession) user_title(XTERM_TITLE, "(no session)"); else user_title(XTERM_TITLE, activesession->name); } #endif tv.tv_sec = check_events(); tv.tv_usec = 0; maxfd=0; FD_ZERO(&readfdmask); if (!eofinput) FD_SET(0, &readfdmask); else if (activesession==nullsession) end_command(0, activesession); for (struct session *ses = sessionlist; ses; ses = ses->next) { if (ses==nullsession) continue; if (ses->nagle) flush_socket(ses); FD_SET(ses->socket, &readfdmask); if (ses->socket>maxfd) maxfd=ses->socket; } result = select(maxfd+1, &readfdmask, 0, 0, &tv); if (need_resize) { char buf[BUFFER_SIZE]; user_resize(); sprintf(buf, "#NEW SCREEN SIZE: %dx%d.", COLS, LINES); tintin_puts1(buf, activesession); } if (result == 0) continue; else if (result < 0 && errno == EINTR) continue; /* Interrupted system call */ else if (result < 0) syserr("select"); if (FD_ISSET(0, &readfdmask)) { PROFSTART; PROFPUSH("user interface"); result=read(0, kbdbuf+inbuf, BUFFER_SIZE-inbuf); if (result==-1) myquitsig(0); if (result==0 && !isatty(0)) eofinput=true; inbuf+=result; i=0; while (i<inbuf) { result=mbrtowc(&ch, kbdbuf+i, inbuf-i, &instate); if (result==-2) /* incomplete but valid sequence */ { memmove(kbdbuf, kbdbuf+i, inbuf-i); inbuf-=i; goto partial; } else if (result==-1) /* invalid sequence */ { ch=0xFFFD; i++; errno=0; /* Shift by 1 byte. We can use a more intelligent shift, * but staying charset-agnostic makes the code simpler. */ } else if (result==0) /* literal 0 */ i++; /* oops... bad ISO/ANSI, bad */ else i+=result; if (user_process_kbd(activesession, ch)) { hist_num=-1; if (term_echoing || (got_more_kludge && done_input[0])) /* got_more_kludge: echo any non-empty line */ { if (activesession && *done_input) if (strcmp(done_input, prev_command)) do_history(done_input, activesession); if (activesession->echo) echo_input(done_input); if (activesession->logfile) write_logf(activesession, done_input, activesession->loginputprefix, activesession->loginputsuffix); } if (*done_input) strcpy(prev_command, done_input); aborting=false; activesession = parse_input(done_input, false, activesession); recursion=0; } } inbuf=0; partial: PROFEND(kbd_lag, kbd_cnt); PROFPOP; } for (struct session *ses = sessionlist; ses; ses = ses->next) { if (ses->socket && FD_ISSET(ses->socket, &readfdmask)) { aborting=false; any_closed=false; do { read_mud(ses); if (any_closed) { any_closed=false; goto after_read; /* The remaining sessions will be done after select() */ } #ifdef HAVE_ZLIB } while (ses->mccp_more); #else } while (0); #endif } } after_read: if (activesession->server_echo && (2-activesession->server_echo != gotpassword)) { gotpassword= 2-activesession->server_echo; if (!gotpassword) got_more_kludge=false; user_passwd(gotpassword && !got_more_kludge); term_echoing=!gotpassword; } }