uint32_t ADI_FindContours( uint8_t *pImage, uint32_t nWidth, uint32_t nHeight )
{
    uint32_t                        i, pixels;
    uint32_t                        j, CONTOURCount;
    uint32_t                        nNumSegments;
    ADI_ITB_MODULE_STATUS           nReturnValue;
    uint32_t                        index,nColour;

    ADI_CONTOUR_DYNAMIC_MEM_ALLOC   *pDMA;
    ADI_CONT_SEGMENT_HDR            *pSegmentListHdr;
    ADI_CONT_SEGMENT_HDR            *pSegmentListTempHdr;
    ADI_CONT_SEGMENT_HDR_ROW_WISE   *pSegmentListRowWiseHdr;

    //-------------------------------------------------------------------------------------
    // ADI BF60x Dice-Counting Demo Developed by Berkeley Design Technology, Inc. (BDTI)
    // ------------------------------------------------------------------------------------
    uint32_t                        nArea, nBoxArea, nSymetry, nRatio;

    pDMA = &oDMA;
    pSegmentListHdr = NULL;
    pSegmentListTempHdr = NULL;
    pSegmentListRowWiseHdr = NULL;
    CONTOURCount = 0;
    nTotalRLENodes = 0;
    index =0;
    oCONTOURInfo. nNumberOfObjects =0;
#ifdef PROFILE
    goProfileFunc.nSum = 0;
    PROFBEG(goProfileFunc);
#endif

    if( InitMemory )
    {
        nReturnValue = adi_DynamicMemInit(pDMA,
                                          aTempBuffer,
                                          TEMP_BUFFER_SIZE,
                                          nHeight*4);
        if ((nReturnValue & 0xFFFF) != (ADI_ITB_STATUS_SUCCESS))
        {
            printf("adi_DynamicMemInit - Not enough memory\n");
            //InitMemory = true;
            return (0);
        }
        pRunLenListHdr = (ADI_CONT_RUN_LEN_HDR *)pDMA->pMemAlloc(pDMA,
                                                                 ADI_CONTOUR_RUN_LEN_HDR,
                                                                 nHeight);
        if (pRunLenListHdr == NULL)
        {
            printf("pRunLenListHdr - Not enough memory\n");
            //InitMemory = true;
            return (0);
        }

        pBoundingRectangle = (ADI_CONT_BOUNDING_RECTANGLE *)pDMA->pMemAlloc(pDMA,
                                                                            ADI_CONTOUR_BOUNDING_RECT,
                                                                            1);
        if (pBoundingRectangle == NULL)
        {
            printf("pBoundingRectangle - Not enough memory\n");
            //InitMemory = true;
            return (0);
        }

        InitMemory = false;
    }

#ifdef PROFILE
    PROFEND(goProfileFunc);
    printf("adi_DynamicMemInit time(ms) = %d\n", goProfileFunc.nSum/500000);
#endif
#ifdef PROFILE
    goProfileFunc.nSum = 0;
    PROFBEG(goProfileFunc);
#endif


#ifdef PROFILE
    PROFEND(goProfileFunc);
    printf("Allocating memory time(ms) = %d\n", goProfileFunc.nSum/500000);
#endif

#ifdef PROFILE
    goProfileFunc.nSum = 0;
    PROFBEG(goProfileFunc);
#endif

#if 1
        nReturnValue = ADIContoursRLEWrapper(     pDMA,
                                                (int8_t *)pImage,
                                                nWidth*nHeight,
                                                pRunLenListHdr,
                                                nWidth,
                                                nHeight );
#else // L3
        for (j = 0; j < nHeight; j++)
                {
                    pRunLenListHdr[j].pNext = NULL;
                    nReturnValue = adi_contour_RLE(pDMA,
                                                   (uint8_t *)pImage + j * nWidth,
                                                   nWidth,
                                                   j,
                                                   &pRunLenListHdr[j]);
                }

#endif

        if ((nReturnValue & 0xFFFF) != (ADI_ITB_STATUS_SUCCESS))
        {
             if ((nReturnValue & 0xFFFF) == (ADI_ITB_STATUS_CONT_NO_RUN_LEN))
                        {
                            printf("Not enough memory\n");
                            printf("\tNot enough ADI_CONT_RUN_LEN_NODE nodes created.\nEdit MAX_NUM_RUN_LEN_NODE in adi_contour_mem_alloc.c to increase number of nodes created\n");
                        }
                        else
                        {
                            printf("Error!!!\n");
                        }
                        pDMA->pMemReset(pDMA,
                                        NULL,
                                        ADI_CONTOUR_RUN_LEN_NODE,
                                        0);
                        return (0);
        }


#ifdef PROFILE
        PROFEND(goProfileFunc);
        printf("adi_contour_RLE time(ms) = %d\n", goProfileFunc.nSum/500000);
#endif
#ifdef PROFILE
    goProfileFunc.nSum = 0;
    PROFBEG(goProfileFunc);
#endif
        nReturnValue = adi_contour_segment_image(pDMA,
                                                 pRunLenListHdr,
                                                 nHeight,
                                                 &pSegmentListHdr,
                                                 &nNumSegments);
        if ((nReturnValue & 0xFFFF) != (ADI_ITB_STATUS_SUCCESS))
                {
                    if ((nReturnValue & 0xFFFF) == (ADI_ITB_STATUS_CONT_NO_SEG_HDR))
                    {
                        printf("Not enough memory\n");
                        printf("\tNot enough ADI_CONTOUR_SEG_HDR nodes created.\nEdit MAX_NUM_SEGMENT_HDR in adi_contour_mem_alloc.c to increase number of nodes created\n");
                    }
                    else if ((nReturnValue & 0xFFFF) == (ADI_ITB_STATUS_CONT_NO_EQU_HDR))
                    {
                        printf("Not enough memory\n");
                        printf("\tNot enough ADI_CONTOUR_EQU_SEG_HDR nodes created.\nEdit MAX_NUM_EQU_SEGMENT_HDR in adi_contour_mem_alloc.c to increase number of nodes created\n");
                    }
                    else if ((nReturnValue & 0xFFFF) == (ADI_ITB_STATUS_CONT_NO_EQUNODE))
                    {
                        printf("Not enough memory\n");
                        printf("\tNot enough ADI_CONTOUR_EQU_SEG_NODE nodes created.\nEdit MAX_NUM_EQU_SEGMENT_NODE in adi_contour_mem_alloc.c to increase number of nodes created\n");
                    }
                    else
                    {
                        printf("Error!!!\n");
                    }
                    pDMA->pMemReset(pDMA,
                                    NULL,
                                    ADI_CONTOUR_RUN_LEN_NODE,
                                    0);
                    pDMA->pMemReset(pDMA,
                                    NULL,
                                    ADI_CONTOUR_SEG_HDR,
                                    0);
                    pSegmentListHdr = NULL;
                    pDMA->pMemReset(pDMA,
                                    NULL,
                                    ADI_CONTOUR_EQU_SEG_HDR,
                                    0);
                    return (0);
                }

#ifdef PROFILE
        PROFEND(goProfileFunc);
        printf("adi_contour_segment_image time(ms) = %d\n", goProfileFunc.nSum/500000);
#endif


        for(pSegmentListTempHdr = pSegmentListHdr ;pSegmentListTempHdr;  )
        {
#ifdef PROFILE
    goProfileFunc.nSum = 0;
    PROFBEG(goProfileFunc);
#endif
            adi_contour_BoundingRectangle(pSegmentListTempHdr,
                                          pBoundingRectangle);
#ifdef PROFILE
        PROFEND(goProfileFunc);
        printf("Bounding Rectange time(ms) = %d\n", goProfileFunc.nSum/500000);
#endif
#ifdef PROFILE
    goProfileFunc.nSum = 0;
    PROFBEG(goProfileFunc);
#endif
            pSegmentListRowWiseHdr = (ADI_CONT_SEGMENT_HDR_ROW_WISE *)pDMA->pMemAlloc(pDMA,
                                                              ADI_CONTOUR_SEG_HDR_ROW_WISE,
                                                              (pBoundingRectangle->nHeight));
            if (pSegmentListRowWiseHdr == NULL)
            {
                printf("pSegmentListRowWiseHdr - Not enough memory\n");
                pDMA->pMemFree(pDMA,
                               pSegmentListRowWiseHdr,
                               ADI_CONTOUR_SEG_HDR_ROW_WISE,
                               (pBoundingRectangle->nHeight));
                return (0XFFFFFFFF);
            }


            adi_contour_SplitSegmentRowWise(pSegmentListTempHdr,
                                            pSegmentListRowWiseHdr,
                                            pBoundingRectangle);

#ifdef PROFILE
        PROFEND(goProfileFunc);
        printf("SplitSegmentRowWise+adi_ContourArea time(ms) = %d\n", goProfileFunc.nSum/500000);
#endif

//-------------------------------------------------------------------------------------------------
//
// Insert Contour classifier code here.
//
// Conceptually similar to boundingRect() in OpenCV
//     	adi_contour_BoundingRectangle() returns the bounding rectangle of a contour.
//		The dimensions are in pixels.
// pBoundingRectangle->nHeight = Height of the rectangle bounding the contour
// pBoundingRectangle->nWidth  = Width of the rectangle bounding the contour
//
// Conceptually similar to contourArea() in OpenCV
//		adi_ContourArea() returns the area of a contour.
//		The area returned is in pixels^2
//
// oDotInfo is a structure used by the graphics module to display bounding boxes
//		see Graphics.c
//		nColour is the bonding box color in ARGB format (0x7FRRGGBB)
//-------------------------------------------------------------------------------------------------

        //-------------------------------------------------------------------------------------
        // ADI BF60x Dice-Counting Demo Developed by Berkeley Design Technology, Inc. (BDTI)
        // ------------------------------------------------------------------------------------
        nArea = adi_ContourArea( pSegmentListRowWiseHdr, pBoundingRectangle->nHeight );
        nBoxArea = pBoundingRectangle->nHeight*pBoundingRectangle->nWidth;
        nSymetry = (10*pBoundingRectangle->nHeight)/pBoundingRectangle->nWidth;

        //---------------------------------------------------------------------------------------------
        //
        // 			BDTI Full Dice Dot Classifier Cascade - 02/18/2013
        //
        //			1st level - Filter out circles that are very big or very small
        //			2nd level - Ratio of calculated area of inscribed circle to measured contour area
        //						Calculated area of inscribed circle = (((contour bounding box area)*PI)/4)
        //						Measured area of arbitrary contour.
        //						Ratio = ((nBoxArea*314)/4)/nArea = 100*((Box Area *PI)/(4*Circle area))
        //			3rd level - Symmetry of contour (bounding box height / bounding box width).
        //						(10*pBoundingRectangle->nHeight)/pBoundingRectangle->nWidth;
        //
        //			If the contour passes through all three levels, it is classified as a Dice Dot and counted.
        //
        //---------------------------------------------------------------------------------------------
        nColour = 0x7F000000; // Black - Used for Demo Mode 1
        if( (nArea >= CIRCLEAREAMIN) && (nArea <= CIRCLEAREAMAX ) )
        {
        	nRatio = (nBoxArea*314)/4;
        	nRatio /= nArea;  // nRatio = ((Box Area * 314)/4)/nArea = 100*((Box Area *PI)/(4*Circle area))

        	nColour = 0x7FFF0000; // RED - Used for Demo Mode 1

        	//adi_FillArea( pImage, nWidth, pSegmentListRowWiseHdr, pBoundingRectangle->nHeight, nBoxArea/10 );
        	//adi_FillArea( pImage, nWidth, pSegmentListRowWiseHdr, pBoundingRectangle->nHeight, nArea/10 );
        	//adi_FillArea( pImage, nWidth, pSegmentListRowWiseHdr, pBoundingRectangle->nHeight, nRatio );
        	if( (nRatio >= MIN_RATIO) && (nRatio <= MAX_RATIO) )
        	{
        		nColour = 0x7F0000FF; // Blue - Used for Demo Mode 1

        		//adi_FillArea( pImage, nWidth, pSegmentListRowWiseHdr, pBoundingRectangle->nHeight, nSymetry ); //nSymetry );
        		if( (nSymetry >= MIN_SYMETRY) && (nSymetry <= MAX_SYMETRY) )
        		{
        			adi_FillArea( pImage, nWidth, pSegmentListRowWiseHdr, pBoundingRectangle->nHeight, 3 );

        			nColour = 0X7F00FF00; // Green - Used for Demo Mode 1
        			CONTOURCount++;
        		}
        	}
        }
        //---------------------------------------------------------------------------------------------

		// Limit viewable bounding rectangles to 1.5 times MAX box area
        if( (nBoxArea <= ((BOXAREAMAX*3)/2) )  )
        {
        	nColour = 0x7F000000 + (nArea&0x00FFFFFF);

        	if(index < MAX_OBJ_INFO)
        	{
        		oCONTOURInfo.aCONTOURInfo[index].nHeight       = pBoundingRectangle->nHeight;
        		oCONTOURInfo.aCONTOURInfo[index].nWidth        = pBoundingRectangle->nWidth;
        		oCONTOURInfo.aCONTOURInfo[index].nXBottomRight = pBoundingRectangle->nXBottomRight;
        		oCONTOURInfo.aCONTOURInfo[index].nXTopLeft     = pBoundingRectangle->nXTopLeft;
        		oCONTOURInfo.aCONTOURInfo[index].nYBottomRight = pBoundingRectangle->nYBottomRight;
        		oCONTOURInfo.aCONTOURInfo[index].nYTopLeft     = pBoundingRectangle->nYTopLeft;
        		oCONTOURInfo.nColour[index++]  = nColour;
        	}
        }

#ifdef PROFILE
    goProfileFunc.nSum = 0;
    PROFBEG(goProfileFunc);
#endif

            pDMA->pMemFree(pDMA,
                            pSegmentListRowWiseHdr,
                            ADI_CONTOUR_SEG_HDR_ROW_WISE,
                            (pBoundingRectangle->nHeight));
            pSegmentListRowWiseHdr = NULL;

#ifdef PROFILE
        PROFEND(goProfileFunc);
        printf("inside for memfree memreset time(ms) = %d\n", goProfileFunc.nSum/500000);
#endif

            pSegmentListTempHdr = pSegmentListTempHdr->pDown;
        } // while

//----------------------------------------------------------------------------------------

#ifdef PROFILE
    goProfileFunc.nSum = 0;
    PROFBEG(goProfileFunc);
#endif

    pDMA->pMemReset(pDMA,
                    pSegmentListHdr,
                    ADI_CONTOUR_SEG_HDR,
                    0);

    pDMA->pMemReset(pDMA,
                    pRunLenListHdr,
                    ADI_CONTOUR_RUN_LEN_NODE,
                    nHeight);
    pSegmentListHdr = NULL;

    pDMA->pMemReset(pDMA,
                    NULL,
                    ADI_CONTOUR_EQU_SEG_HDR,
                    0);

#ifdef PROFILE
    PROFEND(goProfileFunc);
    printf("Memory Cleanup time(ms) = %d\n", goProfileFunc.nSum/500000);
#endif
    oCONTOURInfo. nNumberOfObjects = index;
    oCONTOURInfo.nCONTOURCount = CONTOURCount;
    return (CONTOURCount);
}
void psgetrf_gpu(int *m_in, int *n_in, 
   float *A, int *ia_in, int *ja_in, int *descA, 
   int *ipiv_, int *info)
{

int m = *m_in;
int n = *n_in;
int ia = *ia_in;
int ja = *ja_in;

const int use_setup_desc = TRUE;
const int idebug = 0;
int use_replicated_storage = FALSE;
const int use_broadcast_triangular_matrix = TRUE;

int ia_proc, ja_proc;
int lrindx, lcindx, rsrc,csrc, irsrc,icsrc;
int ictxt, nprow,npcol, myprow,mypcol;
int is_root;

int minmn;
int k1,k2,incx,ip;
int mm, nn, kk, ii, jj, mtmp;
int mm_lu,nn_lu,ia_lu,ja_lu;

int elemSize = sizeof( float );
size_t nbytes;

int nnb, jstart,jend,jsize, isize, jb;
int icontxt, isizeAtmp;


int i,j, iia,jja, ldA, ldhA;
int iinfo = 0;
int iAtmp, jAtmp, iha,jha, iib,jjb,iic,jjc;
int ldAtmp, ldBtmp, lmm,lnn;
int lrA1,lcA1, lrA2,lcA2;

int desc_hA_[DLEN_];
int *desc_hA = &(desc_hA_[0]);

int *ipiv_hA_ = 0;
float *hA = 0;
float *Atmp = 0;
float *dAtmp = 0;

int *gipiv_ = 0;
int desc_Atmp_[DLEN_];
int *desc_Atmp = &(desc_Atmp_[0]);
cublasStatus cu_status;

int isok;
int use_delayed_left_interchange = 1;

int is_mine;
int i1,j1,inc1,  i2,j2,inc2;
int desc_ipiv_hA_[DLEN_];
int *desc_ipiv_hA = &(desc_ipiv_hA_[0]);

int desc_ipiv_[DLEN_];
int *desc_ipiv = &(desc_ipiv_[0]);

int desc_gipiv_[DLEN_];
int *desc_gipiv = &(desc_gipiv_[0]);
int mb,nb, Locp, Locq, lld;


char direc = 'F';
char rowcol = 'R';

char left[] = "Left";
char lower[] = "Lower";
char notrans[] = "NoTrans";
char unit[] = "Unit";

char *side = left;
char *uplo = lower;
char *trans = notrans;
char *diag = unit;

float zero_[REAL_PART+IMAG_PART+1];
float *zero = &(zero_[0]);

float one_[REAL_PART+IMAG_PART+1];
float *one = &(one_[0]);

float neg_one_[REAL_PART+IMAG_PART+1];
float *neg_one = &(neg_one_[0]);

float beta_[REAL_PART+IMAG_PART+1];
float *beta = &(beta_[0]);

float alpha_[REAL_PART+IMAG_PART+1];
float *alpha = &(alpha_[0]);
/*
 * A is a pointer to GPU device memory but conceptually associated
 * with a scalapack distributed matrix 

 * A is array of complex numbers
 */



*info = 0;

zero[REAL_PART] = 0.0;
zero[IMAG_PART] = 0.0;
one[REAL_PART] = 1.0;
one[IMAG_PART] = 0.0;
neg_one[REAL_PART] = -1.0;
neg_one[IMAG_PART] = 0.0;


/*
 * setup copy of distributed matrix on CPU host
 */

hA = 0;
Atmp = 0;

ictxt = descA[CTXT_];
icontxt = ictxt;

Cblacs_gridinfo( ictxt, &nprow, &npcol,  &myprow, &mypcol );
is_root = (myprow == 0) && (mypcol == 0);
if ((idebug >= 1) && (is_root)) {
  printf("pcgetrf_gpu: m %d n %d ia %d ja %d \n",
      m,n,   ia,ja );
};


ia_proc = Cindxg2p( ia, descA[MB_], myprow, descA[RSRC_], nprow);
ja_proc = Cindxg2p( ja, descA[NB_], mypcol, descA[CSRC_], npcol);


/*
 * setup global pivot vector
 */
lld = MIN(m,n) + descA[MB_];
nbytes = lld;
nbytes *= sizeof(int);
if (gipiv_ != 0) {
  free(gipiv_); gipiv_ = 0;
};
gipiv_ = (int *) malloc( nbytes );
assert( gipiv_ != 0 );


desc_gipiv[DTYPE_] = descA[DTYPE_];
desc_gipiv[CTXT_] = descA[CTXT_];
desc_gipiv[M_] = MIN(m,n);
desc_gipiv[N_] = 1;
desc_gipiv[MB_] = desc_gipiv[M_];
desc_gipiv[NB_] = desc_gipiv[N_];
desc_gipiv[LLD_] = lld;

desc_gipiv[RSRC_] = -1;
desc_gipiv[CSRC_] = -1;

  /*
   * setup distribute array hA on host
   */

/*
 * Note, optimal block size on GPU might not be
 * optimal block size on CPU, but assume to be
 * the same for simplicity for now
 */

/*
 * should nnb = descA[NB_] * npcol  ?
 */
nnb = descA[NB_];

minmn = MIN(m,n);
for( jstart=1; jstart <= minmn; jstart = jend + 1) {
  jend = MIN( minmn, jstart + nnb - 1);
  jsize = jend - jstart + 1;

  /*
   * setup matrix on host
   */

  /*
  was iia = (ia-1) + 1;
  */
  j = jstart;
  jb = jsize;

  iia = (ia-1) + jstart;
  jja = (ja-1) + jstart;
  mm = m - jstart + 1;
  nn = jsize;

  if (use_setup_desc) {
    setup_desc( mm,nn, iia,jja,descA, &isize, desc_hA );
    }
  else {
    irsrc = Cindxg2p( iia, descA[MB_], myprow, descA[RSRC_], nprow );
    icsrc = Cindxg2p( jja, descA[NB_], mypcol, descA[CSRC_], npcol );
  
    mb = descA[MB_];
    nb = descA[NB_];
    Locp = Cnumroc( mm, mb, 0,0,nprow );
    Locq = Cnumroc( nn, nb, 0,0,npcol );
    lld = MAX(1,Locp);
    isize = MAX(1,Locp) * MAX(1, Locq );
  
    ictxt = descA[CTXT_];
    iinfo = 0;
    Cdescinit( desc_hA, mm,nn,  mb,nb,  irsrc,icsrc, ictxt, lld, &iinfo);
    assert( iinfo == 0);
    };


  nbytes = isize;
  nbytes *= elemSize;
  if (hA != 0) { 
    free(hA); hA = 0;
  };
  hA = (float *) malloc( nbytes );
  assert( hA != 0 );

  /*
   * distribution of pivot vector is tied to distribution of matrix
   */
  Locp = Cnumroc( desc_hA[M_], desc_hA[MB_], myprow, desc_hA[RSRC_], nprow);
  lld = Locp + desc_hA[MB_];
  nbytes = lld;
  nbytes *= sizeof(int);
  if (ipiv_hA_ != 0) {
    free( ipiv_hA_ ); ipiv_hA_ = 0;
  };
  ipiv_hA_ = (int *) malloc( nbytes );
  assert( ipiv_hA_ != 0);

  Cdescset( desc_ipiv_hA, desc_hA[M_],  1,
              desc_hA[MB_], 1,
              desc_hA[RSRC_], icsrc,
              desc_hA[CTXT_], 
              lld );





  /*
   copy column panel back to CPU host
   to be factored using scalapack
   */ 

  jb = jsize;
  j = jstart;
  mm = m  - j + 1;
  nn = jb;



  /*
    hA(1:mm,1:nn) <-  dA(j:(j+mm-1), j:(j+nn-1) )
   */


  iia = (ia-1) + j;
  jja = (ja-1) + j;
  ii = 1;
  jj = 1;

  PROFSTART("gpu:hA <- dA");
  Cpsgecopy_d2h( mm,nn, A,iia,jja,descA,  hA, ii,jj, desc_hA );
  PROFEND("gpu:hA <- dA");



  /*
   * factor on host CPU using ScaLAPACK
   * Note the pivot vector is tied to the distribution of the matrix
   * Therefore, we need a different "ipiv_hA" pivot vector
   * that is tied the the distributed matrix hA
   */

  ii = 1;
  jj = 1;
  iinfo = 0;
  mm_lu = mm;
  nn_lu = nn;
  ia_lu = ii;
  ja_lu = jj;

  PROFSTART("gpu:psgetrf");
  scalapack_psgetrf( &mm_lu, &nn_lu, 
        hA, &ia_lu, &ja_lu,  desc_hA, &(ipiv_hA(1)), &iinfo );
  PROFEND("gpu:psgetrf");

  /*
   * broadcast pivot vector to global vector
   */



  i1 = 1;
  j1 = 1;
  inc1 = 1;

  i2 = jstart;
  j2 = 1;
  inc2 = 1;
  mtmp = MIN(mm,nn);
  desc_ipiv_hA[CSRC_] = icsrc;

  use_replicated_storage = FALSE;
  if (use_replicated_storage) {
    int ja_lu_proc;

    ja_lu_proc =   Cindxg2p(ja_lu,desc_hA[NB_],
        mypcol,desc_hA[CSRC_],npcol);

    desc_ipiv_hA[CSRC_] =  ja_lu_proc;

    desc_gipiv[RSRC_] = -1;
    desc_gipiv[CSRC_] = -1;
    scalapack_picopy( &mtmp, &(ipiv_hA(1)), &i1,&j1, desc_ipiv_hA, &inc1,
                        &(gipiv(1)), &i2,&j2, desc_gipiv, &inc2 );
    }
  else {
    /*
     * copy to 1 processors (rsrc,csrc), then
     * broadcast to all processors
     */
        int icontxt = desc_ipiv_hA[CTXT_];
        char scope = 'A'; 
        char top = ' ';
        int ntmp = 1;
        int lld; 

        int ia_lu_proc,ja_lu_proc;
        int rsrc, csrc;

        ia_lu_proc = Cindxg2p( ia_lu, desc_hA[MB_],
               myprow,desc_hA[RSRC_],nprow);
        ja_lu_proc = Cindxg2p( ja_lu, desc_hA[NB_],
               mypcol,desc_hA[CSRC_],npcol);

        rsrc = ia_lu_proc;
        csrc = ja_lu_proc;

        desc_gipiv[RSRC_] = rsrc;
        desc_gipiv[CSRC_] = csrc;
        desc_ipiv_hA[CSRC_] = csrc;

        mtmp = MIN( mm_lu, nn_lu);
        scalapack_picopy( &mtmp, &(ipiv_hA(1)), &i1,&j1,desc_ipiv_hA,&inc1,
                  &(gipiv(1)), &i2,&j2, desc_gipiv, &inc2 );

    if ((myprow == rsrc) && (mypcol == csrc)) {

        lld = mtmp;
        ntmp = 1;
        scalapack_igebs2d( &icontxt, &scope, &top,
            &mtmp, &ntmp, &(gipiv(i2)), &lld );
        }
    else {
      lld = mtmp;
      ntmp = 1;
      scalapack_igebr2d( &icontxt, &scope, &top,
            &mtmp, &ntmp, &(gipiv(i2)), &lld, 
            &rsrc,&csrc );
    };
  };

  if (idebug >= 1) {
    int desctmp[DLEN_];
    char name_ipiv_hA[] = "ipiv_hA";
    char name_gipiv[] = "gipiv";

    if (is_root) {
    printf("jstart %d jend %d \n", jstart,jend);
    printf("mm_lu %d nn_lu %d ia_lu %d ja_lu %d\n",
            mm_lu,   nn_lu,   ia_lu,   ja_lu );
    };

    Cdescset(desctmp, desc_hA[M_], npcol,
        desc_hA[MB_],1,
        desc_hA[RSRC_], desc_hA[CSRC_],
        desc_hA[CTXT_], desc_hA[LLD_] );

    Cpilaprnt( MIN(mm_lu,nn_lu), npcol, &(ipiv_hA(1)), 1,1,desctmp, name_ipiv_hA);

    Cdescset(desctmp, minmn*nprow, npcol,
        minmn, 1,    0,0,
        descA[CTXT_], minmn );
    Cpilaprnt( nprow*minmn, npcol, &(gipiv(1)),1,1,desctmp, name_gipiv);
  };


  /*
   * adjust pivot sequence from 1:min(mm,nn) in ipiv to 
   * jstart:(jstart+min(mm,nn)-1)
   */
    for(int i=1; i <= MIN(mm,nn); i++) {
      i2 = (jstart-1) + i;
      gipiv(i2) = gipiv(i2) + (jstart-1);
    };


  if (iinfo < 0) {
     *info = iinfo;
     return;
     };

  if ((*info == 0) && (iinfo > 0)) {
      *info = iinfo + (j-1);
      return;
      };


  /*
   * transfer factored panel back to GPU device
   */

  iia = (ia-1) + j;
  jja = (ja-1) + j;
  ii = 1;
  jj = 1;
  PROFSTART("gpu:A <- hA");
  Cpsgecopy_h2d(mm,nn, hA, ii,jj, desc_hA,
                       A, iia,jja, descA );
  PROFEND("gpu:A <- hA");





  if (use_delayed_left_interchange) {
    /*
     * do nothing for now
     */
    }
  else {
    /* 
     * apply interchanges to columns 1:(j-1)
     */

    nn = j-1;
    k1 = j;
    k2 = j + jb-1;
    incx = 1;


    PROFSTART("gpu:left swap");
    if (nn >= 1) {
         iia = (ia-1) + 1;
         jja = (ja-1) + 1;
         for(kk=k1; kk <= k2; kk++) {
           ip = gipiv(  kk);
           assert(ip >= kk );
           assert( ip <= m );

           if (kk != ip) {
               inc1 = descA[M_];
               inc2 = descA[M_];
               i1 = (iia-1) + kk;
               i2 = (iia-1) + ip;
               j1 = jja;
               j2 = jja;
               Cpsswap_gpu(nn, A,i1,j1,descA,inc1,
                               A,i2,j2,descA,inc2 );
                };
         };
      };
    PROFEND("gpu:left swap");
    };




  /*
   * apply interchanges to columns (j+jb):n
   */

   nn = n - (jend + 1) + 1;
   k1 = j;
   k2 = j + jb - 1;
   incx = 1;



   PROFSTART("gpu:right swap");
   if (nn >= 1) {
      iia = (ia-1) + 1;
      jja = (ja-1) + (jend+1);
      for(kk=k1; kk <= k2; kk++) {
        ip = gipiv(  kk );
        assert( ip >= kk );
        assert( ip <= m );

        if (ip != kk) {
           i1 = (iia-1) + kk;
           i2 = (iia-1) + ip;
           j1 = jja;
           j2 = jja;
           inc1 = descA[M_];
           inc2 = descA[M_];
           Cpsswap_gpu( nn, A, i1,j1, descA, inc1,
                            A, i2,j2, descA, inc2 );
        };
      };
   };
   PROFEND("gpu:right swap");


   PROFSTART("gpu:pTRSM");


   mm = jb;
   nn = n - (jend+1) + 1;
   if ( (1 <= mm) && (1 <= nn)) {
               /*
               cublasCtrsm('L','L','N','U', mm,nn,
                  alpha, dA(j,j), lddA, dA(j,j+jb), lddA );
               */

     if (use_broadcast_triangular_matrix) {
       /*
        * broadcast triangular part, then solve locally
        */
         char lscope = 'A';
         char ltop = ' ';
         int  msize, nsize, lr1,lc1, lr2,lc2;
         int ia_lu_proc, ja_lu_proc;

       /*
        * copy on local processor
        */

         ia_lu_proc = Cindxg2p(ia_lu, desc_hA[MB_], myprow,
                         desc_hA[RSRC_], nprow );
         ja_lu_proc = Cindxg2p(ja_lu, desc_hA[NB_], mypcol,
                         desc_hA[CSRC_], npcol );

       /*
        * complete mm by mm block on Atmp
        */
       ldAtmp = MAX(1,mm);
       Cdescset(desc_Atmp, mm,mm, mm,mm, 
           ia_lu_proc,ja_lu_proc, icontxt, ldAtmp);
       isizeAtmp = ldAtmp * MAX(1,mm);
       nbytes = isizeAtmp;
       nbytes *= elemSize;

       if (Atmp != 0) { free(Atmp); Atmp = 0; };
       Atmp = (float *) malloc( nbytes );
       assert( Atmp != 0);

#ifdef USE_CUBLASV2
       {
         cudaError_t ierr;
         size_t isize = isizeAtmp;
         isize *= elemSize;

         ierr = cudaMalloc( (void **) &dAtmp, isize );
         assert(ierr == cudaSuccess );
       }
#else
       cu_status = cublasAlloc(isizeAtmp, elemSize, (void **) &dAtmp );
       CHKERR(cu_status);
       assert( dAtmp != 0);
#endif

       ii = 1;
       jj = 1;
       scalapack_psgeadd( notrans, &mm, &mm, 
           one,   hA, &ia_lu, &ja_lu, desc_hA,
           zero,  Atmp, &ii, &jj, desc_Atmp );
                 
       rsrc = desc_Atmp[RSRC_];
       csrc = desc_Atmp[CSRC_];
       if ((myprow == rsrc) && (mypcol == csrc)) {
          scalapack_cgebs2d( &icontxt, &lscope, &ltop,   
              &mm, &mm,  Atmp, &ldAtmp );
          }
       else {
         scalapack_cgebr2d( &icontxt, &lscope, &ltop,
              &mm, &mm, Atmp, &ldAtmp,   &rsrc, &csrc );
       };

       inc1 = 1;
       inc2 = 1;
       cu_status = cublasSetVector(isizeAtmp, elemSize, Atmp, inc1, dAtmp, inc2 );
       CHKERR(cu_status);

       /*
        * perform local solve on GPU
        */
       iia = (ia-1) + j;
       jja = (ja-1) + (j+jb);
       local_extent( mm,nn, iia,jja,descA,  
                    &msize,&nsize, &lr1,&lc1, &lr2,&lc2 );
       if (msize >= 1) {
         assert( msize == mm );
       };

       if ((msize >= 1) && (nsize >= 1)) {
         char lside = 'L';
         char luplo = 'L';
         char ltrans = 'N';
         char ldiag = 'U';

         float zalpha;


         zalpha = (float)1.0;//make_float(1.0,0.0);

         CUBLAS_STRSM( 
             ((lside == 'l')||(lside == 'L')) ?
                CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT, 
             ((luplo == 'l')||(luplo == 'L')) ? 
                CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER,
             ((ltrans == 'c')||(ltrans == 'C')) ?
               CUBLAS_OP_C :
                 ((ltrans == 't')||(ltrans == 'T')) ?
                    CUBLAS_OP_T : CUBLAS_OP_N, 
             ((ldiag == 'u')||(ldiag == 'U')) ?
                CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT,
              mm, nsize, zalpha,
              (float *) dAtmp, ldAtmp,
              dA(lr1,lc1), descA[LLD_] );

       };



       if (Atmp != 0) {
         free(Atmp); Atmp = 0;
       };

#ifdef USE_CUBLASV2
       {
         cudaError_t ierr;
         ierr = cudaFree( (void *) dAtmp );
         assert(ierr == cudaSuccess );
         dAtmp  = 0;
       }
#else
       cu_status = cublasFree( dAtmp );
       CHKERR(cu_status );
#endif


     }
     else {
         /*
          * perform triangular solve using scalapack
          */
         iia = (ia-1) + j;
         jja = (ja-1) + (j+jb);
        setup_desc(mm,nn,iia,jja,descA,  &isize, desc_Atmp );

        nbytes = elemSize;
        nbytes *= isize;
        if (Atmp != 0) {
          free(Atmp); Atmp = 0;
        };
        Atmp = (float *) malloc( nbytes );
        assert( Atmp != 0 );



         /*
          * copy to Atmp(1:mm,1:nn) <- dA(j:(j+mm-1),(j+jb):((j+jb)+nn-1))
          */


         ii = 1; jj = 1;
         PROFSTART("gpu:Atmp <- dA");
         Cpsgecopy_d2h( mm,nn,A,iia,jja,descA,
                           Atmp, ii,jj, desc_Atmp );
         PROFEND("gpu:Atmp <- dA");



         /*
          * perform triangular solve using scalapack
          */

          side = left;
          uplo = lower;
          trans = notrans;
          diag = unit;

          alpha = one;

          iha = 1; 
          jha = 1;
          ii = 1; 
          jj = 1;

          PROFSTART("gpu:pstrsm")
          scalapack_pstrsm( side, uplo, trans, diag, 
              &mm,&nn, alpha,    
              hA, &iha,&jha, desc_hA,
              Atmp,&ii,&jj,  desc_Atmp );
          PROFEND("gpu:pstrsm")
          

          /*
           * copy back to GPU
           */

          iia = (ia-1) + j;
          jja = (ja-1) + (j+jb);
          ii = 1; 
          jj = 1;

          PROFSTART("gpu:A <- Atmp");
          Cpsgecopy_h2d( mm,nn, Atmp,ii,jj,desc_Atmp,
                             A, iia,jja, descA );
          PROFEND("gpu:A <- Atmp");
     };
                           



     };
   PROFEND("gpu:pTRSM");


    /*
     * update trailing submatrix
     */


	alpha = neg_one;
	beta = one;
	mm = m-(jend+1) + 1;
	nn = n-(jend+1) + 1;
	kk = jb;

 
      if ((1 <= mm) && (1 <= nn) && (1 <= kk)) {
        
        /*
	 cublasSgemm('N','N',mm,nn,kk,
            alpha, dA(j+jb,j),lddA, dA(j,j+jb),lddA,
            beta, dA(j+jb,j+jb), lddA );
         */

        if (use_broadcast_triangular_matrix) {
          /*
           * Copy from GPU to Atmp
           */
          iia = (ia-1) + j;
          jja = (ja-1) + (j+jb);

          setup_desc( kk,nn, iia,jja, descA, &isizeAtmp, desc_Atmp);
          nbytes = isizeAtmp;
          nbytes *= elemSize;
          if (Atmp != 0) { free(Atmp); Atmp = 0; };
          Atmp = (float *) malloc( nbytes );
          assert( Atmp != 0);

          PROFSTART("gpu:Atmp <- A");
          Cpsgecopy_d2h( kk,nn, A,iia,jja,descA, 
                                Atmp,1,1,desc_Atmp );
          PROFEND("gpu:Atmp <- A");
        };


        iic = (ia-1) + (jend+1);
        jjc = (ja-1) + (jend+1);


       iha = jsize+1;
       jha = 1;
       iAtmp = 1; 
       jAtmp = 1;
     

          {
          char transA = 'N';
          char transB = 'N';

          PROFSTART("zgetrf_gpu:psgemm");
          Cpsgemm_hhd( transA, transB, mm,nn,kk, 
           alpha, hA, iha,jha, desc_hA, 
                  Atmp, iAtmp,jAtmp, desc_Atmp, 
           beta,  A, iic,jjc, descA );

          PROFEND("zgetrf_gpu:psgemm");
           };
       };



    if (Atmp != 0) {
       free(Atmp); Atmp = 0;
       };

    if (ipiv_hA_ != 0) {
       free( ipiv_hA_ ); ipiv_hA_ = 0;
       };
    if (hA != 0) {
      free(hA); hA = 0;
      };

   }; /* for (jstart) */


   if (use_delayed_left_interchange) {

     PROFSTART("gpu:dleft swap");
    for(j=1; j <= minmn; j = jend + 1) {
        jend = MIN( minmn, j+nnb-1);
        jsize = jend - j + 1;
        jb = jsize;
        /*
         * apply interchanges to columns 1:(j-1)
         */
   
        nn = j-1;
        k1 = j;
        k2 = j+jb-1;
        incx = 1;
   
   
        if (nn >= 1) {
         iia = (ia-1) + 1; 
         jja = (ja-1) + 1;
         for(kk=k1; kk <= k2; kk++) {
             ip = gipiv(kk);
             assert( ip >= kk );

             if (ip != kk) {
               inc1 = descA[M_];
               inc2 = descA[M_];
               i1 = (iia-1) + kk;
               i2 = (iia-1) + ip;
               j1 = jja;
               j2 = jja;
               Cpsswap_gpu(nn, A, i1,j1,descA, inc1, 
                               A, i2,j2,descA, inc2 );
             };
         };
        };
     }; /* end for j */
     PROFEND("gpu:dleft swap");
   }; /* end if use delayed left interchange */


   /*
    * adjust global pivot from 1:MIN(m,n) to ia:(ia + MIN(m,n)-1)
    * copy global vector back to distributed pivot vector
    */

   for(int j=1; j <= minmn; j++) {
     gipiv(j) = (ia-1) + gipiv(j);
   };


   lld = descA[MB_] + 
         Cnumroc( descA[M_], descA[MB_], myprow, descA[RSRC_], nprow);

   Cdescset( desc_ipiv, 
              descA[M_],1, 
              descA[MB_], 1, 
              descA[RSRC_], -1, descA[CTXT_], lld );

   i1 = 1; j1 = 1; inc1 = 1;
   i2 = ia; j2 = 1; inc2 = 1;
   mtmp = MIN(m,n);

   PROFSTART("gpu:ipiv");
   use_replicated_storage = FALSE;
   if (use_replicated_storage) {
     int msize,nsize,lr1,lc1,lr2,lc2, lrindx,iia;

     local_extent(MIN(m,n),n,ia,ja,descA, &msize,&nsize, &lr1,&lc1, &lr2,&lc2);
     if (msize >= 1) {
       for(lrindx=lr1; lrindx <= lr2; lrindx++) {
         iia = Cindxl2g( lrindx, descA[MB_], myprow, descA[RSRC_], nprow);
         ipiv(lrindx) =  gipiv( (iia-ia) + 1 );
         };
       };
     }
   else  {
     /*
      * copy to a column, then broadcast
      */
     char scope = 'R';
     char top = ' ';
     int Locp, Locq;
     int lld;
     int icontxt = desc_ipiv[CTXT_];

     desc_ipiv[CSRC_] = ja_proc;
     desc_gipiv[RSRC_] = ia_proc;
     desc_gipiv[CSRC_] = ja_proc;

     mtmp = MIN(m,n);
     scalapack_picopy( &mtmp, &(gipiv(1)), &i1,&j1, desc_gipiv, &inc1,
             &(ipiv(1)), &i2, &j2, desc_ipiv, &inc2 );

     if (idebug >= 1) {
       char cmatnm[] = "ipiv after picopy";
       if (is_root) {
         printf("ia_proc %d ja_proc %d i2 %d j2 %d \n",ia_proc,ja_proc,i2,j2);
       };
       Cpilaprnt( mtmp,1, &(ipiv(1)), i2,j2,desc_ipiv, cmatnm);
     };


     Locp = Cnumroc( ia + MIN(m,n)-1, desc_ipiv[MB_], 
                     myprow, desc_ipiv[RSRC_], nprow);
     lld = MAX(1,Locp);
     Locq = 1;
     if (npcol > 1) {
      if (mypcol == ja_proc) {

       scalapack_igebs2d( &icontxt, &scope, &top, 
           &Locp, &Locq,  &(ipiv(1)), &lld );
      }
      else {
       rsrc = myprow;
       scalapack_igebr2d( &icontxt, &scope, &top,
           &Locp, &Locq, &(ipiv(1)), &lld, &rsrc, &ja_proc );
      };
     };

   };
   PROFEND("gpu:ipiv");

     if (idebug >= 1) {
       int desctmp[DLEN_];
       char cmatnm[] = "final ipiv";
       Cdescset( desctmp, 
           descA[M_],npcol,
           descA[MB_],1,
           descA[RSRC_], descA[CSRC_],
           descA[CTXT_], descA[LLD_]);
       Cpilaprnt( MIN(m,n),npcol, &(ipiv(1)), ia,1,desctmp, cmatnm);
     };





  /*
   * clean up
   */
  if (Atmp != 0) {
       free(Atmp); Atmp = 0;
       };
  if (hA != 0) {
       free(hA); hA = 0;
       };
  if (ipiv_hA_ != 0) {
      free( ipiv_hA_ ); ipiv_hA_ = 0;
      };

  if (gipiv_ != 0) {
     free(gipiv_); gipiv_ = 0;
     };


  return;
}
void Cpcswap_gpu( int n, cuComplex *A, int ia,int ja,int *descA, int incA,
                    cuComplex *B, int ib,int jb,int *descB, int incB )
{
/*
 perform pcswap operation when
 both distributed arrays A and B are in device memory
 */



/*
 * allocate temporary space on host
 * then use pcswap for communication
 */

const int use_MallocHost = FALSE;

cublasStatus cu_status;
size_t nbytes;
int elemSize = sizeof( cuComplex );

float *Atmp = 0;
float *Btmp = 0;

int descAtmp[DLEN_];
int descBtmp[DLEN_];
int ldA, ldB, ldAtmp, ldBtmp;

int nprow,npcol,myprow,mypcol;
int Locp, Locq, lrindx, lcindx, mm,nn;
int LocpA, LocqA, lrindxA, lcindxA;
int LocpB, LocqB, lrindxB, lcindxB;
int isizeA, isizeB, rsrc, csrc;

int iia,jja, iib, jjb;
int incAtmp, incBtmp;
int lrA1,lcA1, lrA2,lcA2;
int lrB1,lcB1, lrB2,lcB2;

Cblacs_gridinfo( descA[CTXT_], &nprow, &npcol, &myprow, &mypcol );


/*
 * allocate storage for vector from A
 */

if (incA == 1) {
   /*
    *  This is a column vector
    */
   mm = n; nn = 1;
   }
else {
  /*
   * This is a row vector
   */
   mm = 1; nn = n;
   };
setup_desc(  mm,nn, ia,ja, descA,   &isizeA, descAtmp );

nbytes = elemSize;
nbytes *= isizeA;
if (use_MallocHost) {
  Atmp = (float *) MallocHost( nbytes );
  }
else {
  Atmp = (float *) malloc( nbytes );
  };
assert( Atmp != 0 );


/*
 * copy vector from A
 */

PROFSTART("swap:GetMatrix");

local_extent( mm,nn,ia,ja,descA,  &LocpA, &LocqA, &lrA1,&lcA1, &lrA2,&lcA2 );

lrindxA = lrA1;
lcindxA = lcA1;

ldA = descA[LLD_];
ldAtmp = descAtmp[LLD_];
if ( (LocpA >= 1) && (LocqA >= 1)) {
  /*
   * copy from GPU device to host CPU
   */
  cu_status = cublasGetMatrix( LocpA,LocqA, elemSize,
             dA(lrindxA,lcindxA), ldA,  Atmp, ldAtmp );

  CHKERR(cu_status);
  };



/*
 * allocate storage for vector from B
 */

Cblacs_gridinfo( descB[CTXT_], &nprow, &npcol, &myprow, &mypcol );

if (incB == 1) {
   /*
    *  This is a column vector
    */
   mm = n; nn = 1;
   }
else {
  /*
   * This is a row vector
   */
   mm = 1; nn = n;
   };
setup_desc( mm,nn, ib,jb,descB, &isizeB, descBtmp );

ldBtmp = descBtmp[LLD_];
ldB = descB[LLD_];

nbytes = elemSize;
nbytes *= isizeB;
if (use_MallocHost) {
  Btmp = (float *) MallocHost( nbytes );
  }
else {
  Btmp = (float *) malloc( nbytes );
  };
assert( Btmp != 0 );



/*
 * copy vector from B
 */

local_extent( mm,nn,ib,jb,descB,  &LocpB, &LocqB, &lrB1,&lcB1,  &lrB2,&lcB2 );

lrindxB = lrB1;
lcindxB = lcB1;



ldB = descB[LLD_];
ldBtmp = descBtmp[LLD_];
if ((LocpB >= 1) && (LocqB >= 1)) {
  /*
   * Copy from GPU to CPU host
   */
  cu_status = cublasGetMatrix(LocpB,LocqB,elemSize,
         dB(lrindxB,lcindxB), ldB, Btmp, ldBtmp );
  CHKERR(cu_status );
  };

PROFEND("swap:GetMatrix");

iia = 1; jja = 1;
iib = 1; jjb = 1;
if (incA == 1) {
   incAtmp = 1;
   }
else {
  incAtmp = descAtmp[M_];
};

if (incB == 1) {
   incBtmp = 1;
    }
else {
   incBtmp = descBtmp[M_];
};


PROFSTART("swap:pcswap");
scalapack_pcswap( &n, Atmp, &iia, &jja, descAtmp, &incAtmp,
                      Btmp, &iib, &jjb, descBtmp, &incBtmp );
PROFEND("swap:pcswap");


/*
 * copy from host CPU back to GPU
 */

PROFSTART("swap:SetMatrix");

if ((LocpA >= 1) && (LocqA >= 1)) {
  /*
   * Copy from CPU host to GPU device
   */
  cu_status = cublasSetMatrix( LocpA, LocqA, elemSize,
              Atmp, ldAtmp, dA(lrindxA,lcindxA), ldA );
  CHKERR(cu_status);
  };


if ((LocpB >= 1) && (LocqB >= 1)) {
  /*
   * Copy from CPU host to GPU device
   */
  cu_status = cublasSetMatrix( LocpB, LocqB, elemSize,
                 Btmp, ldBtmp, dB(lrindxB,lcindxB), ldB );
  CHKERR(cu_status);
  };

PROFEND("swap:SetMatrix");

/*
 * clean up
 */


if (Atmp != 0) {
  if (use_MallocHost) {
    FreeHost(Atmp);
    }
  else {
    free(Atmp); 
    };
  Atmp = 0;
  };

if (Btmp != 0) {
  if (use_MallocHost) {
    FreeHost(Btmp);
     }
  else {
    free(Btmp); 
  };
  Btmp = 0;
 };



return;


}
Пример #4
0
static void tintin(void)
{
    int i, result, maxfd;
    struct timeval tv;
    fd_set readfdmask;
#ifdef XTERM_TITLE
    struct session *lastsession=0;
#endif
    char kbdbuf[BUFFER_SIZE];
    WC ch;
    int inbuf=0;
    mbstate_t instate;

    memset(&instate, 0, sizeof(instate));

    for (;;)
    {
#ifdef XTERM_TITLE
        if (ui_own_output && activesession!=lastsession)
        {
            lastsession=activesession;
            if (activesession==nullsession)
                user_title(XTERM_TITLE, "(no session)");
            else
                user_title(XTERM_TITLE, activesession->name);
        }
#endif

        tv.tv_sec = check_events();
        tv.tv_usec = 0;

        maxfd=0;
        FD_ZERO(&readfdmask);
        if (!eofinput)
            FD_SET(0, &readfdmask);
        else if (activesession==nullsession)
            end_command(0, activesession);
        for (struct session *ses = sessionlist; ses; ses = ses->next)
        {
            if (ses==nullsession)
                continue;
            if (ses->nagle)
                flush_socket(ses);
            FD_SET(ses->socket, &readfdmask);
            if (ses->socket>maxfd)
                maxfd=ses->socket;
        }
        result = select(maxfd+1, &readfdmask, 0, 0, &tv);

        if (need_resize)
        {
            char buf[BUFFER_SIZE];

            user_resize();
            sprintf(buf, "#NEW SCREEN SIZE: %dx%d.", COLS, LINES);
            tintin_puts1(buf, activesession);
        }

        if (result == 0)
            continue;
        else if (result < 0 && errno == EINTR)
            continue;   /* Interrupted system call */
        else if (result < 0)
            syserr("select");

        if (FD_ISSET(0, &readfdmask))
        {
            PROFSTART;
            PROFPUSH("user interface");
            result=read(0, kbdbuf+inbuf, BUFFER_SIZE-inbuf);
            if (result==-1)
                myquitsig(0);
            if (result==0 && !isatty(0))
                eofinput=true;
            inbuf+=result;

            i=0;
            while (i<inbuf)
            {
                result=mbrtowc(&ch, kbdbuf+i, inbuf-i, &instate);
                if (result==-2)         /* incomplete but valid sequence */
                {
                    memmove(kbdbuf, kbdbuf+i, inbuf-i);
                    inbuf-=i;
                    goto partial;
                }
                else if (result==-1)    /* invalid sequence */
                {
                    ch=0xFFFD;
                    i++;
                    errno=0;
                    /* Shift by 1 byte.  We can use a more intelligent shift,
                     * but staying charset-agnostic makes the code simpler.
                     */
                }
                else if (result==0)     /* literal 0 */
                    i++; /* oops... bad ISO/ANSI, bad */
                else
                    i+=result;
                if (user_process_kbd(activesession, ch))
                {
                    hist_num=-1;

                    if (term_echoing || (got_more_kludge && done_input[0]))
                        /* got_more_kludge: echo any non-empty line */
                    {
                        if (activesession && *done_input)
                            if (strcmp(done_input, prev_command))
                                do_history(done_input, activesession);
                        if (activesession->echo)
                            echo_input(done_input);
                        if (activesession->logfile)
                            write_logf(activesession, done_input,
                                activesession->loginputprefix, activesession->loginputsuffix);
                    }
                    if (*done_input)
                        strcpy(prev_command, done_input);
                    aborting=false;
                    activesession = parse_input(done_input, false, activesession);
                    recursion=0;
                }
            }
            inbuf=0;
        partial:
            PROFEND(kbd_lag, kbd_cnt);
            PROFPOP;
        }
        for (struct session *ses = sessionlist; ses; ses = ses->next)
        {
            if (ses->socket && FD_ISSET(ses->socket, &readfdmask))
            {
                aborting=false;
                any_closed=false;
                do
                {
                    read_mud(ses);
                    if (any_closed)
                    {
                        any_closed=false;
                        goto after_read;
                        /* The remaining sessions will be done after select() */
                    }
#ifdef HAVE_ZLIB
                } while (ses->mccp_more);
#else
                } while (0);
#endif
            }
        }
    after_read:
        if (activesession->server_echo
            && (2-activesession->server_echo != gotpassword))
        {
            gotpassword= 2-activesession->server_echo;
            if (!gotpassword)
                got_more_kludge=false;
            user_passwd(gotpassword && !got_more_kludge);
            term_echoing=!gotpassword;
        }
    }