Пример #1
0
/*
 * Gather A from the distributed compressed row format to
 * global A in compressed column format.
 */
int pdCompRow_loc_to_CompCol_global
(
 int_t need_value, /* Input. Whether need to gather numerical values */
 SuperMatrix *A,   /* Input. Distributed matrix in NRformat_loc format. */
 gridinfo_t *grid, /* Input */
 SuperMatrix *GA   /* Output */
)
{
    NRformat_loc *Astore;
    NCformat *GAstore;
    double *a, *a_loc;
    int_t *colind, *rowptr;
    int_t *colptr_loc, *rowind_loc;
    int_t m_loc, n, i, j, k, l;
    int_t colnnz, fst_row, m_loc_max, nnz_loc, nnz_max, nnz;
    double *a_recv;  /* Buffer to receive the blocks of values. */
    double *a_buf;   /* Buffer to merge blocks into block columns. */
    int_t *colcnt, *itemp;
    int_t *colptr_send; /* Buffer to redistribute the column pointers of the 
			   local block rows.
			   Use n_loc+1 pointers for each block. */
    int_t *colptr_blk;  /* The column pointers for each block, after
			   redistribution to the local block columns. 
			   Use n_loc+1 pointers for each block. */
    int_t *rowind_recv; /* Buffer to receive the blocks of row indices. */
    int_t *rowind_buf;  /* Buffer to merge blocks into block columns. */
    int_t *fst_rows, *n_locs;
    int   *sendcnts, *sdispls, *recvcnts, *rdispls, *itemp_32;
    int   it, n_loc, procs;

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(grid->iam, "Enter pdCompRow_loc_to_CompCol_global");
#endif

    /* Initialization. */
    n = A->ncol;
    Astore = (NRformat_loc *) A->Store;
    nnz_loc = Astore->nnz_loc;
    m_loc = Astore->m_loc;
    fst_row = Astore->fst_row;
    a = Astore->nzval;
    rowptr = Astore->rowptr;
    colind = Astore->colind;
    n_loc = m_loc; /* NOTE: CURRENTLY ONLY WORK FOR SQUARE MATRIX */

    /* ------------------------------------------------------------
       FIRST PHASE: TRANSFORM A INTO DISTRIBUTED COMPRESSED COLUMN.
       ------------------------------------------------------------*/
    dCompRow_to_CompCol_dist(m_loc, n, nnz_loc, a, colind, rowptr, &a_loc,
                             &rowind_loc, &colptr_loc);
    /* Change local row index numbers to global numbers. */
    for (i = 0; i < nnz_loc; ++i) rowind_loc[i] += fst_row;

#if ( DEBUGlevel>=2 )
    printf("Proc %d\n", grid->iam);
    PrintInt10("rowind_loc", nnz_loc, rowind_loc);
    PrintInt10("colptr_loc", n+1, colptr_loc);
#endif

    procs = grid->nprow * grid->npcol;
    if ( !(fst_rows = (int_t *) intMalloc_dist(2*procs)) )
	  ABORT("Malloc fails for fst_rows[]");
    n_locs = fst_rows + procs;
    MPI_Allgather(&fst_row, 1, mpi_int_t, fst_rows, 1, mpi_int_t,
		  grid->comm);
    for (i = 0; i < procs-1; ++i) n_locs[i] = fst_rows[i+1] - fst_rows[i];
    n_locs[procs-1] = n - fst_rows[procs-1];
    if ( !(recvcnts = SUPERLU_MALLOC(5*procs * sizeof(int))) )
	  ABORT("Malloc fails for recvcnts[]");
    sendcnts = recvcnts + procs;
    rdispls = sendcnts + procs;
    sdispls = rdispls + procs;
    itemp_32 = sdispls + procs;

    /* All-to-all transfer column pointers of each block.
       Now the matrix view is P-by-P block-partition. */
    /* n column starts for each column, and procs column ends for each block */
    if ( !(colptr_send = intMalloc_dist(n + procs)) )
	   ABORT("Malloc fails for colptr_send[]");
    if ( !(colptr_blk = intMalloc_dist( (((size_t) n_loc)+1)*procs)) )
	   ABORT("Malloc fails for colptr_blk[]");
    for (i = 0, j = 0; i < procs; ++i) {
        for (k = j; k < j + n_locs[i]; ++k) colptr_send[i+k] = colptr_loc[k];
	colptr_send[i+k] = colptr_loc[k]; /* Add an END marker */
	sendcnts[i] = n_locs[i] + 1;
#if ( DEBUGlevel>=1 )
	assert(j == fst_rows[i]);
#endif
	sdispls[i] = j + i;
	recvcnts[i] = n_loc + 1;
	rdispls[i] = i * (n_loc + 1);
	j += n_locs[i]; /* First column of next block in colptr_loc[] */
    }
    MPI_Alltoallv(colptr_send, sendcnts, sdispls, mpi_int_t,
		  colptr_blk, recvcnts, rdispls, mpi_int_t, grid->comm);

    /* Adjust colptr_blk[] so that they contain the local indices of the
       column pointers in the receive buffer. */
    nnz = 0; /* The running sum of the nonzeros counted by far */
    k = 0;
    for (i = 0; i < procs; ++i) {
	for (j = rdispls[i]; j < rdispls[i] + n_loc; ++j) {
	    colnnz = colptr_blk[j+1] - colptr_blk[j];
	    /*assert(k<=j);*/
	    colptr_blk[k] = nnz;
	    nnz += colnnz; /* Start of the next column */
	    ++k;
	}
	colptr_blk[k++] = nnz; /* Add an END marker for each block */
    }
    /*assert(k == (n_loc+1)*procs);*/

    /* Now prepare to transfer row indices and values. */
    sdispls[0] = 0;
    for (i = 0; i < procs-1; ++i) {
        sendcnts[i] = colptr_loc[fst_rows[i+1]] - colptr_loc[fst_rows[i]];
	sdispls[i+1] = sdispls[i] + sendcnts[i];
    }
    sendcnts[procs-1] = colptr_loc[n] - colptr_loc[fst_rows[procs-1]];
    for (i = 0; i < procs; ++i) {
        j = rdispls[i]; /* Point to this block in colptr_blk[]. */
	recvcnts[i] = colptr_blk[j+n_loc] - colptr_blk[j];
    }
    rdispls[0] = 0; /* Recompute rdispls[] for row indices. */
    for (i = 0; i < procs-1; ++i) rdispls[i+1] = rdispls[i] + recvcnts[i];

    k = rdispls[procs-1] + recvcnts[procs-1]; /* Total received */
    if ( !(rowind_recv = (int_t *) intMalloc_dist(2*k)) )
        ABORT("Malloc fails for rowind_recv[]");
    rowind_buf = rowind_recv + k;
    MPI_Alltoallv(rowind_loc, sendcnts, sdispls, mpi_int_t,
		  rowind_recv, recvcnts, rdispls, mpi_int_t, grid->comm);
    if ( need_value ) {
        if ( !(a_recv = (double *) doubleMalloc_dist(2*k)) )
	    ABORT("Malloc fails for rowind_recv[]");
	a_buf = a_recv + k;
	MPI_Alltoallv(a_loc, sendcnts, sdispls, MPI_DOUBLE,
                      a_recv, recvcnts, rdispls, MPI_DOUBLE,
                      grid->comm);
    }
      
    /* Reset colptr_loc[] to point to the n_loc global columns. */
    colptr_loc[0] = 0;
    itemp = colptr_send;
    for (j = 0; j < n_loc; ++j) {
        colnnz = 0;
	for (i = 0; i < procs; ++i) {
	    k = i * (n_loc + 1) + j; /* j-th column in i-th block */
	    colnnz += colptr_blk[k+1] - colptr_blk[k];
	}
	colptr_loc[j+1] = colptr_loc[j] + colnnz;
	itemp[j] = colptr_loc[j]; /* Save a copy of the column starts */
    }
    itemp[n_loc] = colptr_loc[n_loc];
      
    /* Merge blocks of row indices into columns of row indices. */
    for (i = 0; i < procs; ++i) {
        k = i * (n_loc + 1);
	for (j = 0; j < n_loc; ++j) { /* i-th block */
	    for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) {
	        rowind_buf[itemp[j]] = rowind_recv[l];
		++itemp[j];
	    }
	}
    }

    if ( need_value ) {
        for (j = 0; j < n_loc+1; ++j) itemp[j] = colptr_loc[j];
        for (i = 0; i < procs; ++i) {
	    k = i * (n_loc + 1);
	    for (j = 0; j < n_loc; ++j) { /* i-th block */
	        for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) {
		    a_buf[itemp[j]] = a_recv[l];
		    ++itemp[j];
		}
	    }
	}
    }

    /* ------------------------------------------------------------
       SECOND PHASE: GATHER TO GLOBAL A IN COMPRESSED COLUMN FORMAT.
       ------------------------------------------------------------*/
    GA->nrow  = A->nrow;
    GA->ncol  = A->ncol;
    GA->Stype = SLU_NC;
    GA->Dtype = A->Dtype;
    GA->Mtype = A->Mtype;
    GAstore = GA->Store = (NCformat *) SUPERLU_MALLOC ( sizeof(NCformat) );
    if ( !GAstore ) ABORT ("SUPERLU_MALLOC fails for GAstore");

    /* First gather the size of each piece. */
    nnz_loc = colptr_loc[n_loc];
    MPI_Allgather(&nnz_loc, 1, mpi_int_t, itemp, 1, mpi_int_t, grid->comm);
    for (i = 0, nnz = 0; i < procs; ++i) nnz += itemp[i];
    GAstore->nnz = nnz;
    
    if ( !(GAstore->rowind = (int_t *) intMalloc_dist (nnz)) )
        ABORT ("SUPERLU_MALLOC fails for GAstore->rowind[]");
    if ( !(GAstore->colptr = (int_t *) intMalloc_dist (n+1)) )
        ABORT ("SUPERLU_MALLOC fails for GAstore->colptr[]");
      
    /* Allgatherv for row indices. */
    rdispls[0] = 0;
    for (i = 0; i < procs-1; ++i) {
        rdispls[i+1] = rdispls[i] + itemp[i];
        itemp_32[i] = itemp[i];
    }
    itemp_32[procs-1] = itemp[procs-1];
    it = nnz_loc;
    MPI_Allgatherv(rowind_buf, it, mpi_int_t, GAstore->rowind, 
		   itemp_32, rdispls, mpi_int_t, grid->comm);
    if ( need_value ) {
      if ( !(GAstore->nzval = (double *) doubleMalloc_dist (nnz)) )
          ABORT ("SUPERLU_MALLOC fails for GAstore->rnzval[]");
      MPI_Allgatherv(a_buf, it, MPI_DOUBLE, GAstore->nzval, 
		     itemp_32, rdispls, MPI_DOUBLE, grid->comm);
    } else GAstore->nzval = NULL;

    /* Now gather the column pointers. */
    rdispls[0] = 0;
    for (i = 0; i < procs-1; ++i) {
        rdispls[i+1] = rdispls[i] + n_locs[i];
        itemp_32[i] = n_locs[i];
    }
    itemp_32[procs-1] = n_locs[procs-1];
    MPI_Allgatherv(colptr_loc, n_loc, mpi_int_t, GAstore->colptr, 
		   itemp_32, rdispls, mpi_int_t, grid->comm);

    /* Recompute column pointers. */
    for (i = 1; i < procs; ++i) {
        k = rdispls[i];
	for (j = 0; j < n_locs[i]; ++j) GAstore->colptr[k++] += itemp[i-1];
	itemp[i] += itemp[i-1]; /* prefix sum */
    }
    GAstore->colptr[n] = nnz;

#if ( DEBUGlevel>=2 )
    if ( !grid->iam ) {
        printf("After pdCompRow_loc_to_CompCol_global()\n");
	dPrint_CompCol_Matrix_dist(GA);
    }
#endif

    SUPERLU_FREE(a_loc);
    SUPERLU_FREE(rowind_loc);
    SUPERLU_FREE(colptr_loc);
    SUPERLU_FREE(fst_rows);
    SUPERLU_FREE(recvcnts);
    SUPERLU_FREE(colptr_send);
    SUPERLU_FREE(colptr_blk);
    SUPERLU_FREE(rowind_recv);
    if ( need_value) SUPERLU_FREE(a_recv);
#if ( DEBUGlevel>=1 )
    if ( !grid->iam ) printf("sizeof(NCformat) %d\n", sizeof(NCformat));
    CHECK_MALLOC(grid->iam, "Exit pdCompRow_loc_to_CompCol_global");
#endif
    return 0;
} /* pdCompRow_loc_to_CompCol_global */
Пример #2
0
int zcreate_dist_matrix(SuperMatrix *A, int_t m, int_t n, int_t nnz,
			doublecomplex *nzval_g, int_t *rowind_g, int_t *colptr_g,
			gridinfo_t *grid)
{
    SuperMatrix GA;              /* global A */
    int_t    *rowind, *colptr;	 /* global */
    doublecomplex   *nzval;             /* global */
    doublecomplex   *nzval_loc;         /* local */
    int_t    *colind, *rowptr;	 /* local */
    int_t    m_loc, fst_row, nnz_loc;
    int_t    m_loc_fst; /* Record m_loc of the first p-1 processors,
			   when mod(m, p) is not zero. */ 
    int_t    iam, row, col, i, j, relpos;
    char     trans[1];
    int_t    *marker;

    iam = grid->iam;

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Enter zcreate_dist_matrix()");
#endif
 
    if ( !iam ) {

        /* Allocate storage for compressed column representation. */
        zallocateA_dist(n, nnz, &nzval, &rowind, &colptr);

	/* Copy the global matrix. */
#if 0
	/* and ADJUST to 0-based indexing 
           which is required by the C routines.*/
#endif
        for(i=0; i<nnz; i++){
	  nzval[i]=nzval_g[i];
	  rowind[i]=rowind_g[i]; /* - 1;*/
        }
        for(i=0; i<n+1; i++)
	  colptr[i]=colptr_g[i]; /* - 1;*/


	/* Broadcast matrix A to the other PEs. */
	MPI_Bcast( &m,     1,   mpi_int_t,  0, grid->comm );
	MPI_Bcast( &n,     1,   mpi_int_t,  0, grid->comm );
	MPI_Bcast( &nnz,   1,   mpi_int_t,  0, grid->comm );
	MPI_Bcast( nzval,  nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid->comm );
	MPI_Bcast( rowind, nnz, mpi_int_t,  0, grid->comm );
	MPI_Bcast( colptr, n+1, mpi_int_t,  0, grid->comm );
    } else {
	/* Receive matrix A from PE 0. */
	MPI_Bcast( &m,   1,   mpi_int_t,  0, grid->comm );
	MPI_Bcast( &n,   1,   mpi_int_t,  0, grid->comm );
	MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid->comm );

	/* Allocate storage for compressed column representation. */
	zallocateA_dist(n, nnz, &nzval, &rowind, &colptr);

	MPI_Bcast( nzval,   nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid->comm );
	MPI_Bcast( rowind,  nnz, mpi_int_t,  0, grid->comm );
	MPI_Bcast( colptr,  n+1, mpi_int_t,  0, grid->comm );
    }

#if 0
    nzval[0]=0.1;
#endif

    /* Compute the number of rows to be distributed to local process */
    m_loc = m / (grid->nprow * grid->npcol); 
    m_loc_fst = m_loc;
    /* When m / procs is not an integer */
    if ((m_loc * grid->nprow * grid->npcol) != m) {
      m_loc = m_loc+1;
      m_loc_fst = m_loc;
      if (iam == (grid->nprow * grid->npcol - 1)) 
	m_loc = m - m_loc_fst * (grid->nprow * grid->npcol - 1);
    }

    /* Create compressed column matrix for GA. */
    zCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr,
				SLU_NC, SLU_Z, SLU_GE);


    /*************************************************
     * Change GA to a local A with NR_loc format     *
     *************************************************/

    rowptr = (int_t *) intMalloc_dist(m_loc+1);
    marker = (int_t *) intCalloc_dist(n);

    /* Get counts of each row of GA */
    for (i = 0; i < n; ++i)
      for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]];
    /* Set up row pointers */
    rowptr[0] = 0;
    fst_row = iam * m_loc_fst;
    nnz_loc = 0;
    for (j = 0; j < m_loc; ++j) {
      row = fst_row + j;
      rowptr[j+1] = rowptr[j] + marker[row];
      marker[j] = rowptr[j];
    }
    nnz_loc = rowptr[m_loc];

    nzval_loc = (doublecomplex *) doublecomplexMalloc_dist(nnz_loc);
    colind = (int_t *) intMalloc_dist(nnz_loc);

    /* Transfer the matrix into the compressed row storage */
    for (i = 0; i < n; ++i) {
      for (j = colptr[i]; j < colptr[i+1]; ++j) {
	row = rowind[j];
	if ( (row>=fst_row) && (row<fst_row+m_loc) ) {
	  row = row - fst_row;
	  relpos = marker[row];
	  colind[relpos] = i;
	  nzval_loc[relpos] = nzval[j];
	  ++marker[row];
	}
      }
    }

#if ( DEBUGlevel>=1 )
    if ( !iam ) dPrint_CompCol_Matrix_dist(&GA);
#endif   


    /* Destroy GA */
    Destroy_CompCol_Matrix_dist(&GA);


    /******************************************************/
    /* Change GA to a local A with NR_loc format */
    /******************************************************/

    /* Set up the local A in NR_loc format */
    zCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row,
				   nzval_loc, colind, rowptr,
				   SLU_NR_loc, SLU_Z, SLU_GE);
    
    SUPERLU_FREE(marker);

#if ( DEBUGlevel>=1 )
    printf("sizeof(NRforamt_loc) %d\n", sizeof(NRformat_loc));
    CHECK_MALLOC(iam, "Exit dcreate_dist_matrix()");
#endif
    return 0;
}