static void pdgstrf2
/************************************************************************/
(
 superlu_options_t *options,
 int_t k, double thresh, Glu_persist_t *Glu_persist, gridinfo_t *grid,
 LocalLU_t *Llu, MPI_Request *U_diag_blk_send_req,
 SuperLUStat_t *stat, int* info
 )
/* 
 * Purpose
 * =======
 *
 *   Panel factorization -- block column k
 *   Factor diagonal and subdiagonal blocks and test for exact singularity.
 *   Only the column processes that owns block column *k* participate
 *   in the work.
 * 
 * Arguments
 * =========
 *
 * k      (input) int (global)
 *        The column number of the block column to be factorized.
 *
 * thresh (input) double (global)
 *        The threshold value = s_eps * anorm.
 *
 * Glu_persist (input) Glu_persist_t*
 *        Global data structures (xsup, supno) replicated on all processes.
 *
 * grid   (input) gridinfo_t*
 *        The 2D process mesh.
 *
 * Llu    (input/output) LocalLU_t*
 *        Local data structures to store distributed L and U matrices.
 *
 * U_diag_blk_send_req (input/output) MPI_Request*
 *        List of send requests to send down the diagonal block of U.
 *
 * stat   (output) SuperLUStat_t*
 *        Record the statistics about the factorization.
 *        See SuperLUStat_t structure defined in util.h.
 *
 * info   (output) int*
 *        = 0: successful exit
 *        < 0: if info = -i, the i-th argument had an illegal value
 *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
 *             been completed, but the factor U is exactly singular,
 *             and division by zero will occur if it is used to solve a
 *             system of equations.
 *
 */
{
    int    cols_left, iam, l, pkk, pr;
    int    incx = 1, incy = 1;
    int    nsupr; /* number of rows in the block (LDA) */
    int    luptr;
    int_t  i, krow, j, jfst, jlst, u_diag_cnt;
    int_t  nsupc; /* number of columns in the block */
    int_t  *xsup = Glu_persist->xsup;
    int_t  Pr;
    MPI_Status status;
    MPI_Comm comm = (grid->cscp).comm;
    double *lusup, temp;
    double *ujrow, *ublk_ptr; /* pointer to the U block */
    double alpha = -1;
    *info = 0;

    /* Quick return. */

    /* Initialization. */
    iam   = grid->iam;
    Pr    = grid->nprow;
    krow  = PROW( k, grid );
    pkk   = PNUM( PROW(k, grid), PCOL(k, grid), grid );
    j     = LBj( k, grid ); /* Local block number */
    jfst  = FstBlockC( k );
    jlst  = FstBlockC( k+1 );
    lusup = Llu->Lnzval_bc_ptr[j];
    nsupc = SuperSize( k );
    if ( Llu->Lrowind_bc_ptr[j] ) nsupr = Llu->Lrowind_bc_ptr[j][1];
    ublk_ptr = ujrow = Llu->ujrow;

    luptr = 0; /* point to the diagonal entries. */
    cols_left = nsupc; /* supernode size */
    u_diag_cnt = 0;

    if ( iam == pkk ) { /* diagonal process */

        if ( U_diag_blk_send_req && U_diag_blk_send_req[krow] ) {
	    /* There are pending sends - wait for all Isend to complete */
            for (pr = 0; pr < Pr; ++pr)
                if (pr != krow)
                    MPI_Wait(U_diag_blk_send_req + pr, &status);
        }

	for (j = 0; j < jlst - jfst; ++j) { /* for each column in panel */
	    
	    /* Diagonal pivot */
	    i = luptr;
	    if ( options->ReplaceTinyPivot == YES || lusup[i] == 0.0 ) {
		if ( fabs(lusup[i]) < thresh ) {
#if ( PRNTlevel>=2 )
		    printf("(%d) .. col %d, tiny pivot %e  ",
			   iam, jfst+j, lusup[i]);
#endif
		    /* Keep the new diagonal entry with the same sign. */
		    if ( lusup[i] < 0 ) lusup[i] = -thresh;
		    else lusup[i] = thresh;
#if ( PRNTlevel>=2 )
		    printf("replaced by %e\n", lusup[i]);
#endif
		    ++(stat->TinyPivots);
		}
	    }

	    for (l = 0; l < cols_left; ++l, i += nsupr, ++u_diag_cnt)
                ublk_ptr[u_diag_cnt] = lusup[i]; /* copy one row of U */

	    if ( ujrow[0] == 0.0 ) { /* Test for singularity. */
		*info = j+jfst+1;
	    } else { /* Scale the j-th column. */
		temp = 1.0 / ujrow[0];
		for (i = luptr+1; i < luptr-j+nsupr; ++i) lusup[i] *= temp;
		stat->ops[FACT] += nsupr-j-1;
	    }

	    /* Rank-1 update of the trailing submatrix. */
	    if ( --cols_left ) {
		l = nsupr - j - 1;
#ifdef _CRAY
		SGER(&l, &cols_left, &alpha, &lusup[luptr+1], &incx,
		     &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr);
#else
		dger_(&l, &cols_left, &alpha, &lusup[luptr+1], &incx,
		      &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr);
#endif
		stat->ops[FACT] += 2 * l * cols_left;

	    }
	    ujrow = ublk_ptr + u_diag_cnt;  /* move to next row of U */
	    luptr += nsupr + 1;	                 /* move to next column */

	} /* for column j ... */

	if ( U_diag_blk_send_req && iam == pkk ) { /* Send the U block */
	    /** ALWAYS SEND TO ALL OTHERS - TO FIX **/
	    for (pr = 0; pr < Pr; ++pr)
		if (pr != krow)
		    MPI_Isend(ublk_ptr, u_diag_cnt, MPI_DOUBLE, pr,
			      ((k<<2)+2)%NTAGS, comm, U_diag_blk_send_req + pr);
	    U_diag_blk_send_req[krow] = 1; /* flag outstanding Isend */
	}

    } else  { /* non-diagonal process */

	/* Receive the diagonal block of U */
        MPI_Recv(ublk_ptr, (nsupc*(nsupc+1))>>1, MPI_DOUBLE,
		 krow, ((k<<2)+2)%NTAGS, comm, &status);

	for (j = 0; j < jlst - jfst; ++j) { /* for each column in panel */
	    u_diag_cnt += cols_left;

	    if ( !lusup ) { /* empty block column */
		--cols_left;
		if ( ujrow[0] == 0.0 ) *info = j+jfst+1;
		continue;
	    }

	    /* Test for singularity. */
	    if ( ujrow[0] == 0.0 ) {
		*info = j+jfst+1;
	    } else {
		/* Scale the j-th column. */
		temp = 1.0 / ujrow[0];
		for (i = luptr; i < luptr+nsupr; ++i) lusup[i] *= temp;
		stat->ops[FACT] += nsupr;
	    }

	    /* Rank-1 update of the trailing submatrix. */
	    if ( --cols_left ) {
#ifdef _CRAY
		SGER(&nsupr, &cols_left, &alpha, &lusup[luptr], &incx, 
		     &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr);
#else
		dger_(&nsupr, &cols_left, &alpha, &lusup[luptr], &incx, 
		      &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr);
#endif
		stat->ops[FACT] += 2 * nsupr * cols_left;

	    }

	    ujrow = ublk_ptr + u_diag_cnt; /* move to next row of U */
	    luptr += nsupr;                      /* move to next column */

	} /* for column j ... */

    } /* end if pkk ... */

} /* PDGSTRF2 */
Exemple #2
0
/*! \brief
 *
 * <pre>
 * Purpose
 * =======
 *   Factor diagonal and subdiagonal blocks and test for exact singularity.
 *   Only the process column that owns block column *k* participates
 *   in the work.
 * 
 * Arguments
 * =========
 *
 * k      (input) int (global)
 *        The column number of the block column to be factorized.
 *
 * thresh (input) double (global)
 *        The threshold value = s_eps * anorm.
 *
 * Glu_persist (input) Glu_persist_t*
 *        Global data structures (xsup, supno) replicated on all processes.
 *
 * grid   (input) gridinfo_t*
 *        The 2D process mesh.
 *
 * Llu    (input/output) LocalLU_t*
 *        Local data structures to store distributed L and U matrices.
 *
 * stat   (output) SuperLUStat_t*
 *        Record the statistics about the factorization.
 *        See SuperLUStat_t structure defined in util.h.
 *
 * info   (output) int*
 *        = 0: successful exit
 *        < 0: if info = -i, the i-th argument had an illegal value
 *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
 *             been completed, but the factor U is exactly singular,
 *             and division by zero will occur if it is used to solve a
 *             system of equations.
 * </pre>
 */
static void pdgstrf2
/************************************************************************/
(
 superlu_options_t *options,
 int_t k, double thresh, Glu_persist_t *Glu_persist, gridinfo_t *grid,
 LocalLU_t *Llu, SuperLUStat_t *stat, int* info
 )

{
    int    c, iam, l, pkk;
    int    incx = 1, incy = 1;
    int    nsupr; /* number of rows in the block (LDA) */
    int    luptr;
    int_t  i, krow, j, jfst, jlst;
    int_t  nsupc; /* number of columns in the block */
    int_t  *xsup = Glu_persist->xsup;
    double *lusup, temp;
    double *ujrow;
    double alpha = -1;
    *info = 0;

    /* Quick return. */

    /* Initialization. */
    iam   = grid->iam;
    krow  = PROW( k, grid );
    pkk   = PNUM( PROW(k, grid), PCOL(k, grid), grid );
    j     = LBj( k, grid ); /* Local block number */
    jfst  = FstBlockC( k );
    jlst  = FstBlockC( k+1 );
    lusup = Llu->Lnzval_bc_ptr[j];
    nsupc = SuperSize( k );
    if ( Llu->Lrowind_bc_ptr[j] ) nsupr = Llu->Lrowind_bc_ptr[j][1];
    ujrow = Llu->ujrow;

    luptr = 0; /* Point to the diagonal entries. */
    c = nsupc;
    for (j = 0; j < jlst - jfst; ++j) {
	/* Broadcast the j-th row (nsupc - j) elements to
	   the process column. */
	if ( iam == pkk ) { /* Diagonal process. */
	    i = luptr;
	    if ( options->ReplaceTinyPivot == YES || lusup[i] == 0.0 ) {
		if ( fabs(lusup[i]) < thresh ) { /* Diagonal */
#if ( PRNTlevel>=2 )
		    printf("(%d) .. col %d, tiny pivot %e  ",
			   iam, jfst+j, lusup[i]);
#endif
		    /* Keep the replaced diagonal with the same sign. */
		    if ( lusup[i] < 0 ) lusup[i] = -thresh;
		    else lusup[i] = thresh;
#if ( PRNTlevel>=2 )
		    printf("replaced by %e\n", lusup[i]);
#endif
		    ++(stat->TinyPivots);
		}
	    }
	    for (l = 0; l < c; ++l, i += nsupr)	ujrow[l] = lusup[i];
	}
#if 0
	dbcast_col(ujrow, c, pkk, UjROW, grid, &c);
#else
	MPI_Bcast(ujrow, c, MPI_DOUBLE, krow, (grid->cscp).comm);
	/*bcast_tree(ujrow, c, MPI_DOUBLE, krow, (24*k+j)%NTAGS,
		   grid, COMM_COLUMN, &c);*/
#endif

#if ( DEBUGlevel>=2 )
if ( k == 3329 && j == 2 ) {
	if ( iam == pkk ) {
	    printf("..(%d) k %d, j %d: Send ujrow[0] %e\n",iam,k,j,ujrow[0]);
	} else {
	    printf("..(%d) k %d, j %d: Recv ujrow[0] %e\n",iam,k,j,ujrow[0]);
	}
}
#endif

	if ( !lusup ) { /* Empty block column. */
	    --c;
	    if ( ujrow[0] == 0.0 ) *info = j+jfst+1;
	    continue;
	}

	/* Test for singularity. */
	if ( ujrow[0] == 0.0 ) {
	    *info = j+jfst+1;
	} else {
	    /* Scale the j-th column of the matrix. */
	    temp = 1.0 / ujrow[0];
	    if ( iam == pkk ) {
		for (i = luptr+1; i < luptr-j+nsupr; ++i) lusup[i] *= temp;
		stat->ops[FACT] += nsupr-j-1;
	    } else {
		for (i = luptr; i < luptr+nsupr; ++i) lusup[i] *= temp;
		stat->ops[FACT] += nsupr;
	    }
	}
	    
	/* Rank-1 update of the trailing submatrix. */
	if ( --c ) {
	    if ( iam == pkk ) {
		l = nsupr - j - 1;
#ifdef _CRAY
		SGER(&l, &c, &alpha, &lusup[luptr+1], &incx,
		     &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr);
#else
		dger_(&l, &c, &alpha, &lusup[luptr+1], &incx,
		      &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr);
#endif
		stat->ops[FACT] += 2 * l * c;
	    } else {
#ifdef _CRAY
		SGER(&nsupr, &c, &alpha, &lusup[luptr], &incx, 
		     &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr);
#else
		dger_(&nsupr, &c, &alpha, &lusup[luptr], &incx, 
		      &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr);
#endif
		stat->ops[FACT] += 2 * nsupr * c;
	    }
	}
	
	/* Move to the next column. */
	if ( iam == pkk ) luptr += nsupr + 1;
	else luptr += nsupr;

    } /* for j ... */

} /* PDGSTRF2 */
Exemple #3
0
static void pdgstrf2
/************************************************************************/
(
 superlu_options_t *options,
 int_t k, double thresh, Glu_persist_t *Glu_persist, gridinfo_t *grid,
 LocalLU_t *Llu, SuperLUStat_t *stat, int* info
 )
/* 
 * Purpose
 * =======
 *   Factor diagonal and subdiagonal blocks and test for exact singularity.
 *   Only the process column that owns block column *k* participates
 *   in the work.
 * 
 * Arguments
 * =========
 *
 * k      (input) int (global)
 *        The column number of the block column to be factorized.
 *
 * thresh (input) double (global)
 *        The threshold value = s_eps * anorm.
 *
 * Glu_persist (input) Glu_persist_t*
 *        Global data structures (xsup, supno) replicated on all processes.
 *
 * grid   (input) gridinfo_t*
 *        The 2D process mesh.
 *
 * Llu    (input/output) LocalLU_t*
 *        Local data structures to store distributed L and U matrices.
 *
 * stat   (output) SuperLUStat_t*
 *        Record the statistics about the factorization.
 *        See SuperLUStat_t structure defined in util.h.
 *
 * info   (output) int*
 *        = 0: successful exit
 *        < 0: if info = -i, the i-th argument had an illegal value
 *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
 *             been completed, but the factor U is exactly singular,
 *             and division by zero will occur if it is used to solve a
 *             system of equations.
 *
 */
{
    int    c, iam, l, pkk;
    int    incx = 1, incy = 1;
    int    nsupr; /* number of rows in the block (LDA) */
    int    luptr;
    int_t  i, krow, j, jfst, jlst;
    int_t  nsupc; /* number of columns in the block */
    int_t  *xsup = Glu_persist->xsup;
    double *lusup, temp;
    double *ujrow;
    double alpha = -1;
    *info = 0;

    /* Quick return. */

    /* Initialization. */
    iam   = grid->iam;
    krow  = PROW( k, grid );
    pkk   = PNUM( PROW(k, grid), PCOL(k, grid), grid );
    j     = LBj( k, grid ); /* Local block number */
    jfst  = FstBlockC( k );
    jlst  = FstBlockC( k+1 );
    lusup = Llu->Lnzval_bc_ptr[j];
    nsupc = SuperSize( k );
    if ( Llu->Lrowind_bc_ptr[j] ) nsupr = Llu->Lrowind_bc_ptr[j][1];
    ujrow = Llu->ujrow;

    luptr = 0; /* Point to the diagonal entries. */
    c = nsupc;
    for (j = 0; j < jlst - jfst; ++j) {
	/* Broadcast the j-th row (nsupc - j) elements to
	   the process column. */
	if ( iam == pkk ) { /* Diagonal process. */
	    i = luptr;
	    if ( options->ReplaceTinyPivot == YES || lusup[i] == 0.0 ) {
		if ( fabs(lusup[i]) < thresh ) { /* Diagonal */
#if ( PRNTlevel>=2 )
		    printf("(%d) .. col %d, tiny pivot %e  ",
			   iam, jfst+j, lusup[i]);
#endif
		    /* Keep the replaced diagonal with the same sign. */
		    if ( lusup[i] < 0 ) lusup[i] = -thresh;
		    else lusup[i] = thresh;
#if ( PRNTlevel>=2 )
		    printf("replaced by %e\n", lusup[i]);
#endif
		    ++(stat->TinyPivots);
		}
	    }
	    for (l = 0; l < c; ++l, i += nsupr)	ujrow[l] = lusup[i];
	}
#if 0
	dbcast_col(ujrow, c, pkk, UjROW, grid, &c);
#else
	MPI_Bcast(ujrow, c, MPI_DOUBLE, krow, (grid->cscp).comm);
	/*bcast_tree(ujrow, c, MPI_DOUBLE, krow, (24*k+j)%NTAGS,
		   grid, COMM_COLUMN, &c);*/
#endif

#if ( DEBUGlevel>=2 )
if ( k == 3329 && j == 2 ) {
	if ( iam == pkk ) {
	    printf("..(%d) k %d, j %d: Send ujrow[0] %e\n",iam,k,j,ujrow[0]);
	} else {
	    printf("..(%d) k %d, j %d: Recv ujrow[0] %e\n",iam,k,j,ujrow[0]);
	}
}
#endif

	if ( !lusup ) { /* Empty block column. */
	    --c;
	    if ( ujrow[0] == 0.0 ) *info = j+jfst+1;
	    continue;
	}

	/* Test for singularity. */
	if ( ujrow[0] == 0.0 ) {
	    *info = j+jfst+1;
	} else {
	    /* Scale the j-th column of the matrix. */
	    temp = 1.0 / ujrow[0];
	    if ( iam == pkk ) {
		for (i = luptr+1; i < luptr-j+nsupr; ++i) lusup[i] *= temp;
		stat->ops[FACT] += nsupr-j-1;
	    } else {
		for (i = luptr; i < luptr+nsupr; ++i) lusup[i] *= temp;
		stat->ops[FACT] += nsupr;
	    }
	}
	    
	/* Rank-1 update of the trailing submatrix. */
	if ( --c ) {
	    if ( iam == pkk ) {
		l = nsupr - j - 1;
#ifdef _CRAY
		SGER(&l, &c, &alpha, &lusup[luptr+1], &incx,
		     &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr);
#elif defined (USE_VENDOR_BLAS)
		dger_(&l, &c, &alpha, &lusup[luptr+1], &incx,
		      &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr);
#else
		hypre_F90_NAME_BLAS(dger,DGER)(&l, &c, &alpha, 
                      &lusup[luptr+1], &incx,
		      &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr);
#endif
		stat->ops[FACT] += 2 * l * c;
	    } else {
#ifdef _CRAY
		SGER(&nsupr, &c, &alpha, &lusup[luptr], &incx, 
		     &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr);
#elif defined (USE_VENDOR_BLAS)
		dger_(&nsupr, &c, &alpha, &lusup[luptr], &incx, 
		      &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr);
#else
		hypre_F90_NAME_BLAS(dger,DGER)(&nsupr, &c, &alpha, 
                      &lusup[luptr], &incx, 
		      &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr);
#endif
		stat->ops[FACT] += 2 * nsupr * c;
	    }
	}
	
	/* Move to the next column. */
	if ( iam == pkk ) luptr += nsupr + 1;
	else luptr += nsupr;

    } /* for j ... */

} /* PDGSTRF2 */
Exemple #4
0
/*! \brief
 *
 * <pre>
 * Purpose
 * =======
 *   Panel factorization -- block column k
 *
 *   Factor diagonal and subdiagonal blocks and test for exact singularity.
 *   Only the column processes that own block column *k* participate
 *   in the work.
 * 
 * Arguments
 * =========
 *
 * k      (input) int (global)
 *        The column number of the block column to be factorized.
 *
 * thresh (input) double (global)
 *        The threshold value = s_eps * anorm.
 *
 * Glu_persist (input) Glu_persist_t*
 *        Global data structures (xsup, supno) replicated on all processes.
 *
 * grid   (input) gridinfo_t*
 *        The 2D process mesh.
 *
 * Llu    (input/output) LocalLU_t*
 *        Local data structures to store distributed L and U matrices.
 *
 * U_diag_blk_send_req (input/output) MPI_Request*
 *        List of send requests to send down the diagonal block of U.
 *
 * stat   (output) SuperLUStat_t*
 *        Record the statistics about the factorization.
 *        See SuperLUStat_t structure defined in util.h.
 *
 * info   (output) int*
 *        = 0: successful exit
 *        < 0: if info = -i, the i-th argument had an illegal value
 *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
 *             been completed, but the factor U is exactly singular,
 *             and division by zero will occur if it is used to solve a
 *             system of equations.
 * </pre>
 */
static void pdgstrf2
(
 superlu_options_t *options,
 int_t k, double thresh, Glu_persist_t *Glu_persist, gridinfo_t *grid,
 LocalLU_t *Llu, MPI_Request *U_diag_blk_send_req, 
 SuperLUStat_t *stat, int* info
 )
{
    int    cols_left, iam, l, pkk, pr;
    int    incx = 1, incy = 1;
    int    nsupr; /* number of rows in the block (LDA) */
    int    luptr;
    int_t  i, krow, j, jfst, jlst, u_diag_cnt;
    int_t  nsupc; /* number of columns in the block */
    int_t  *xsup = Glu_persist->xsup;
    double *lusup, temp;
    double *ujrow, *ublk_ptr; /* pointer to the U block */
    double alpha = -1, zero = 0.0;
    int_t  Pr;
    MPI_Status status;
    MPI_Comm comm = (grid->cscp).comm;

    /* Quick return. */
    *info = 0;

    /* Initialization. */
    iam   = grid->iam;
    Pr    = grid->nprow;
    krow  = PROW( k, grid );
    pkk   = PNUM( PROW(k, grid), PCOL(k, grid), grid );
    j     = LBj( k, grid ); /* Local block number */
    jfst  = FstBlockC( k );
    jlst  = FstBlockC( k+1 );
    lusup = Llu->Lnzval_bc_ptr[j];
    nsupc = SuperSize( k );
    if ( Llu->Lrowind_bc_ptr[j] ) nsupr = Llu->Lrowind_bc_ptr[j][1];
    ublk_ptr = ujrow = Llu->ujrow;

    luptr = 0; /* Point to the diagonal entries. */
    cols_left = nsupc; /* supernode size */
    u_diag_cnt = 0;

    if ( iam == pkk ) { /* diagonal process */

        if ( U_diag_blk_send_req && U_diag_blk_send_req[krow] ) {
	    /* There are pending sends - wait for all Isend to complete */
            for (pr = 0; pr < Pr; ++pr)
                if (pr != krow)
                    MPI_Wait(U_diag_blk_send_req + pr, &status);
        }

        for (j = 0; j < jlst - jfst; ++j) { /* for each column in panel */

	    /* Diagonal pivot */
	    i = luptr;
	    if ( options->ReplaceTinyPivot == YES || lusup[i] == 0.0 ) {
		if ( fabs(lusup[i]) < thresh ) { /* Diagonal */
#if ( PRNTlevel>=2 )
		    printf("(%d) .. col %d, tiny pivot %e  ",
			   iam, jfst+j, lusup[i]);
#endif
		    /* Keep the new diagonal entry with the same sign. */
		    if ( lusup[i] < 0 ) lusup[i] = -thresh;
		    else lusup[i] = thresh;
#if ( PRNTlevel>=2 )
		    printf("replaced by %e\n", lusup[i]);
#endif
		    ++(stat->TinyPivots);
		}
	    }

	    for (l = 0; l < cols_left; ++l, i += nsupr, ++u_diag_cnt)
                ublk_ptr[u_diag_cnt] = lusup[i]; /* copy one row of U */

	    if ( ujrow[0] == zero ) { /* Test for singularity. */
		*info = j+jfst+1;
	    } else { /* Scale the j-th column. */
		temp = 1.0 / ujrow[0];
		for (i = luptr+1; i < luptr-j+nsupr; ++i) lusup[i] *= temp;
		stat->ops[FACT] += nsupr-j-1;
	    }

	    /* Rank-1 update of the trailing submatrix. */
	    if ( --cols_left ) {
		l = nsupr - j - 1;
#ifdef _CRAY
		SGER(&l, &cols_left, &alpha, &lusup[luptr+1], &incx,
		     &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr);
#else
		dger_(&l, &cols_left, &alpha, &lusup[luptr+1], &incx,
		      &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr);
#endif
		stat->ops[FACT] += 2 * l * cols_left;
	    }
	    ujrow = ublk_ptr + u_diag_cnt;  /* move to next row of U */
	    luptr += nsupr + 1;	            /* move to next column */

	} /* for column j ... */

	if ( U_diag_blk_send_req && iam == pkk ) { /* Send the U block */
	    /** ALWAYS SEND TO ALL OTHERS - TO FIX **/
	    for (pr = 0; pr < Pr; ++pr)
		if (pr != krow)
		    MPI_Isend(ublk_ptr, u_diag_cnt, MPI_DOUBLE, pr,
			      ((k<<2)+2)%NTAGS, comm, U_diag_blk_send_req + pr);
	    U_diag_blk_send_req[krow] = 1; /* flag outstanding Isend */
	}

    } else  { /* non-diagonal process */

	/* Receive the diagonal block of U */
        MPI_Recv(ublk_ptr, (nsupc*(nsupc+1))>>1, MPI_DOUBLE,
		 krow, ((k<<2)+2)%NTAGS, comm, &status);

	for (j = 0; j < jlst - jfst; ++j) { /* for each column in panel */
	    u_diag_cnt += cols_left;

	    if ( !lusup ) { /* empty block column */
		--cols_left;
		if ( ujrow[0] == zero ) *info = j+jfst+1;
		continue;
	    }

	    /* Test for singularity. */
	    if ( ujrow[0] == zero ) {
		*info = j+jfst+1;
	    } else {
		/* Scale the j-th column. */
		temp = 1.0 / ujrow[0];
		for (i = luptr; i < luptr+nsupr; ++i) lusup[i] *= temp;
		stat->ops[FACT] += nsupr;
	    }

	    /* Rank-1 update of the trailing submatrix. */
	    if ( --cols_left ) {
#ifdef _CRAY
		SGER(&nsupr, &cols_left, &alpha, &lusup[luptr], &incx, 
		     &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr);
#else
		dger_(&nsupr, &cols_left, &alpha, &lusup[luptr], &incx, 
		      &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr);
#endif
		stat->ops[FACT] += 2 * nsupr * cols_left;
	    }

	    ujrow = ublk_ptr + u_diag_cnt; /* move to next row of U */
	    luptr += nsupr;                /* move to next column */

	} /* for column j ... */

    } /* end if pkk ... */

} /* PDGSTRF2 */