/* * Gather A from the distributed compressed row format to * global A in compressed column format. */ int pdCompRow_loc_to_CompCol_global ( int_t need_value, /* Input. Whether need to gather numerical values */ SuperMatrix *A, /* Input. Distributed matrix in NRformat_loc format. */ gridinfo_t *grid, /* Input */ SuperMatrix *GA /* Output */ ) { NRformat_loc *Astore; NCformat *GAstore; double *a, *a_loc; int_t *colind, *rowptr; int_t *colptr_loc, *rowind_loc; int_t m_loc, n, i, j, k, l; int_t colnnz, fst_row, m_loc_max, nnz_loc, nnz_max, nnz; double *a_recv; /* Buffer to receive the blocks of values. */ double *a_buf; /* Buffer to merge blocks into block columns. */ int_t *colcnt, *itemp; int_t *colptr_send; /* Buffer to redistribute the column pointers of the local block rows. Use n_loc+1 pointers for each block. */ int_t *colptr_blk; /* The column pointers for each block, after redistribution to the local block columns. Use n_loc+1 pointers for each block. */ int_t *rowind_recv; /* Buffer to receive the blocks of row indices. */ int_t *rowind_buf; /* Buffer to merge blocks into block columns. */ int_t *fst_rows, *n_locs; int *sendcnts, *sdispls, *recvcnts, *rdispls, *itemp_32; int it, n_loc, procs; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(grid->iam, "Enter pdCompRow_loc_to_CompCol_global"); #endif /* Initialization. */ n = A->ncol; Astore = (NRformat_loc *) A->Store; nnz_loc = Astore->nnz_loc; m_loc = Astore->m_loc; fst_row = Astore->fst_row; a = Astore->nzval; rowptr = Astore->rowptr; colind = Astore->colind; n_loc = m_loc; /* NOTE: CURRENTLY ONLY WORK FOR SQUARE MATRIX */ /* ------------------------------------------------------------ FIRST PHASE: TRANSFORM A INTO DISTRIBUTED COMPRESSED COLUMN. ------------------------------------------------------------*/ dCompRow_to_CompCol_dist(m_loc, n, nnz_loc, a, colind, rowptr, &a_loc, &rowind_loc, &colptr_loc); /* Change local row index numbers to global numbers. */ for (i = 0; i < nnz_loc; ++i) rowind_loc[i] += fst_row; #if ( DEBUGlevel>=2 ) printf("Proc %d\n", grid->iam); PrintInt10("rowind_loc", nnz_loc, rowind_loc); PrintInt10("colptr_loc", n+1, colptr_loc); #endif procs = grid->nprow * grid->npcol; if ( !(fst_rows = (int_t *) intMalloc_dist(2*procs)) ) ABORT("Malloc fails for fst_rows[]"); n_locs = fst_rows + procs; MPI_Allgather(&fst_row, 1, mpi_int_t, fst_rows, 1, mpi_int_t, grid->comm); for (i = 0; i < procs-1; ++i) n_locs[i] = fst_rows[i+1] - fst_rows[i]; n_locs[procs-1] = n - fst_rows[procs-1]; if ( !(recvcnts = SUPERLU_MALLOC(5*procs * sizeof(int))) ) ABORT("Malloc fails for recvcnts[]"); sendcnts = recvcnts + procs; rdispls = sendcnts + procs; sdispls = rdispls + procs; itemp_32 = sdispls + procs; /* All-to-all transfer column pointers of each block. Now the matrix view is P-by-P block-partition. */ /* n column starts for each column, and procs column ends for each block */ if ( !(colptr_send = intMalloc_dist(n + procs)) ) ABORT("Malloc fails for colptr_send[]"); if ( !(colptr_blk = intMalloc_dist( (((size_t) n_loc)+1)*procs)) ) ABORT("Malloc fails for colptr_blk[]"); for (i = 0, j = 0; i < procs; ++i) { for (k = j; k < j + n_locs[i]; ++k) colptr_send[i+k] = colptr_loc[k]; colptr_send[i+k] = colptr_loc[k]; /* Add an END marker */ sendcnts[i] = n_locs[i] + 1; #if ( DEBUGlevel>=1 ) assert(j == fst_rows[i]); #endif sdispls[i] = j + i; recvcnts[i] = n_loc + 1; rdispls[i] = i * (n_loc + 1); j += n_locs[i]; /* First column of next block in colptr_loc[] */ } MPI_Alltoallv(colptr_send, sendcnts, sdispls, mpi_int_t, colptr_blk, recvcnts, rdispls, mpi_int_t, grid->comm); /* Adjust colptr_blk[] so that they contain the local indices of the column pointers in the receive buffer. */ nnz = 0; /* The running sum of the nonzeros counted by far */ k = 0; for (i = 0; i < procs; ++i) { for (j = rdispls[i]; j < rdispls[i] + n_loc; ++j) { colnnz = colptr_blk[j+1] - colptr_blk[j]; /*assert(k<=j);*/ colptr_blk[k] = nnz; nnz += colnnz; /* Start of the next column */ ++k; } colptr_blk[k++] = nnz; /* Add an END marker for each block */ } /*assert(k == (n_loc+1)*procs);*/ /* Now prepare to transfer row indices and values. */ sdispls[0] = 0; for (i = 0; i < procs-1; ++i) { sendcnts[i] = colptr_loc[fst_rows[i+1]] - colptr_loc[fst_rows[i]]; sdispls[i+1] = sdispls[i] + sendcnts[i]; } sendcnts[procs-1] = colptr_loc[n] - colptr_loc[fst_rows[procs-1]]; for (i = 0; i < procs; ++i) { j = rdispls[i]; /* Point to this block in colptr_blk[]. */ recvcnts[i] = colptr_blk[j+n_loc] - colptr_blk[j]; } rdispls[0] = 0; /* Recompute rdispls[] for row indices. */ for (i = 0; i < procs-1; ++i) rdispls[i+1] = rdispls[i] + recvcnts[i]; k = rdispls[procs-1] + recvcnts[procs-1]; /* Total received */ if ( !(rowind_recv = (int_t *) intMalloc_dist(2*k)) ) ABORT("Malloc fails for rowind_recv[]"); rowind_buf = rowind_recv + k; MPI_Alltoallv(rowind_loc, sendcnts, sdispls, mpi_int_t, rowind_recv, recvcnts, rdispls, mpi_int_t, grid->comm); if ( need_value ) { if ( !(a_recv = (double *) doubleMalloc_dist(2*k)) ) ABORT("Malloc fails for rowind_recv[]"); a_buf = a_recv + k; MPI_Alltoallv(a_loc, sendcnts, sdispls, MPI_DOUBLE, a_recv, recvcnts, rdispls, MPI_DOUBLE, grid->comm); } /* Reset colptr_loc[] to point to the n_loc global columns. */ colptr_loc[0] = 0; itemp = colptr_send; for (j = 0; j < n_loc; ++j) { colnnz = 0; for (i = 0; i < procs; ++i) { k = i * (n_loc + 1) + j; /* j-th column in i-th block */ colnnz += colptr_blk[k+1] - colptr_blk[k]; } colptr_loc[j+1] = colptr_loc[j] + colnnz; itemp[j] = colptr_loc[j]; /* Save a copy of the column starts */ } itemp[n_loc] = colptr_loc[n_loc]; /* Merge blocks of row indices into columns of row indices. */ for (i = 0; i < procs; ++i) { k = i * (n_loc + 1); for (j = 0; j < n_loc; ++j) { /* i-th block */ for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) { rowind_buf[itemp[j]] = rowind_recv[l]; ++itemp[j]; } } } if ( need_value ) { for (j = 0; j < n_loc+1; ++j) itemp[j] = colptr_loc[j]; for (i = 0; i < procs; ++i) { k = i * (n_loc + 1); for (j = 0; j < n_loc; ++j) { /* i-th block */ for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) { a_buf[itemp[j]] = a_recv[l]; ++itemp[j]; } } } } /* ------------------------------------------------------------ SECOND PHASE: GATHER TO GLOBAL A IN COMPRESSED COLUMN FORMAT. ------------------------------------------------------------*/ GA->nrow = A->nrow; GA->ncol = A->ncol; GA->Stype = SLU_NC; GA->Dtype = A->Dtype; GA->Mtype = A->Mtype; GAstore = GA->Store = (NCformat *) SUPERLU_MALLOC ( sizeof(NCformat) ); if ( !GAstore ) ABORT ("SUPERLU_MALLOC fails for GAstore"); /* First gather the size of each piece. */ nnz_loc = colptr_loc[n_loc]; MPI_Allgather(&nnz_loc, 1, mpi_int_t, itemp, 1, mpi_int_t, grid->comm); for (i = 0, nnz = 0; i < procs; ++i) nnz += itemp[i]; GAstore->nnz = nnz; if ( !(GAstore->rowind = (int_t *) intMalloc_dist (nnz)) ) ABORT ("SUPERLU_MALLOC fails for GAstore->rowind[]"); if ( !(GAstore->colptr = (int_t *) intMalloc_dist (n+1)) ) ABORT ("SUPERLU_MALLOC fails for GAstore->colptr[]"); /* Allgatherv for row indices. */ rdispls[0] = 0; for (i = 0; i < procs-1; ++i) { rdispls[i+1] = rdispls[i] + itemp[i]; itemp_32[i] = itemp[i]; } itemp_32[procs-1] = itemp[procs-1]; it = nnz_loc; MPI_Allgatherv(rowind_buf, it, mpi_int_t, GAstore->rowind, itemp_32, rdispls, mpi_int_t, grid->comm); if ( need_value ) { if ( !(GAstore->nzval = (double *) doubleMalloc_dist (nnz)) ) ABORT ("SUPERLU_MALLOC fails for GAstore->rnzval[]"); MPI_Allgatherv(a_buf, it, MPI_DOUBLE, GAstore->nzval, itemp_32, rdispls, MPI_DOUBLE, grid->comm); } else GAstore->nzval = NULL; /* Now gather the column pointers. */ rdispls[0] = 0; for (i = 0; i < procs-1; ++i) { rdispls[i+1] = rdispls[i] + n_locs[i]; itemp_32[i] = n_locs[i]; } itemp_32[procs-1] = n_locs[procs-1]; MPI_Allgatherv(colptr_loc, n_loc, mpi_int_t, GAstore->colptr, itemp_32, rdispls, mpi_int_t, grid->comm); /* Recompute column pointers. */ for (i = 1; i < procs; ++i) { k = rdispls[i]; for (j = 0; j < n_locs[i]; ++j) GAstore->colptr[k++] += itemp[i-1]; itemp[i] += itemp[i-1]; /* prefix sum */ } GAstore->colptr[n] = nnz; #if ( DEBUGlevel>=2 ) if ( !grid->iam ) { printf("After pdCompRow_loc_to_CompCol_global()\n"); dPrint_CompCol_Matrix_dist(GA); } #endif SUPERLU_FREE(a_loc); SUPERLU_FREE(rowind_loc); SUPERLU_FREE(colptr_loc); SUPERLU_FREE(fst_rows); SUPERLU_FREE(recvcnts); SUPERLU_FREE(colptr_send); SUPERLU_FREE(colptr_blk); SUPERLU_FREE(rowind_recv); if ( need_value) SUPERLU_FREE(a_recv); #if ( DEBUGlevel>=1 ) if ( !grid->iam ) printf("sizeof(NCformat) %d\n", sizeof(NCformat)); CHECK_MALLOC(grid->iam, "Exit pdCompRow_loc_to_CompCol_global"); #endif return 0; } /* pdCompRow_loc_to_CompCol_global */
int zcreate_dist_matrix(SuperMatrix *A, int_t m, int_t n, int_t nnz, doublecomplex *nzval_g, int_t *rowind_g, int_t *colptr_g, gridinfo_t *grid) { SuperMatrix GA; /* global A */ int_t *rowind, *colptr; /* global */ doublecomplex *nzval; /* global */ doublecomplex *nzval_loc; /* local */ int_t *colind, *rowptr; /* local */ int_t m_loc, fst_row, nnz_loc; int_t m_loc_fst; /* Record m_loc of the first p-1 processors, when mod(m, p) is not zero. */ int_t iam, row, col, i, j, relpos; char trans[1]; int_t *marker; iam = grid->iam; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter zcreate_dist_matrix()"); #endif if ( !iam ) { /* Allocate storage for compressed column representation. */ zallocateA_dist(n, nnz, &nzval, &rowind, &colptr); /* Copy the global matrix. */ #if 0 /* and ADJUST to 0-based indexing which is required by the C routines.*/ #endif for(i=0; i<nnz; i++){ nzval[i]=nzval_g[i]; rowind[i]=rowind_g[i]; /* - 1;*/ } for(i=0; i<n+1; i++) colptr[i]=colptr_g[i]; /* - 1;*/ /* Broadcast matrix A to the other PEs. */ MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); MPI_Bcast( nzval, nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid->comm ); MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); } else { /* Receive matrix A from PE 0. */ MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); /* Allocate storage for compressed column representation. */ zallocateA_dist(n, nnz, &nzval, &rowind, &colptr); MPI_Bcast( nzval, nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid->comm ); MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); } #if 0 nzval[0]=0.1; #endif /* Compute the number of rows to be distributed to local process */ m_loc = m / (grid->nprow * grid->npcol); m_loc_fst = m_loc; /* When m / procs is not an integer */ if ((m_loc * grid->nprow * grid->npcol) != m) { m_loc = m_loc+1; m_loc_fst = m_loc; if (iam == (grid->nprow * grid->npcol - 1)) m_loc = m - m_loc_fst * (grid->nprow * grid->npcol - 1); } /* Create compressed column matrix for GA. */ zCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, SLU_NC, SLU_Z, SLU_GE); /************************************************* * Change GA to a local A with NR_loc format * *************************************************/ rowptr = (int_t *) intMalloc_dist(m_loc+1); marker = (int_t *) intCalloc_dist(n); /* Get counts of each row of GA */ for (i = 0; i < n; ++i) for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]]; /* Set up row pointers */ rowptr[0] = 0; fst_row = iam * m_loc_fst; nnz_loc = 0; for (j = 0; j < m_loc; ++j) { row = fst_row + j; rowptr[j+1] = rowptr[j] + marker[row]; marker[j] = rowptr[j]; } nnz_loc = rowptr[m_loc]; nzval_loc = (doublecomplex *) doublecomplexMalloc_dist(nnz_loc); colind = (int_t *) intMalloc_dist(nnz_loc); /* Transfer the matrix into the compressed row storage */ for (i = 0; i < n; ++i) { for (j = colptr[i]; j < colptr[i+1]; ++j) { row = rowind[j]; if ( (row>=fst_row) && (row<fst_row+m_loc) ) { row = row - fst_row; relpos = marker[row]; colind[relpos] = i; nzval_loc[relpos] = nzval[j]; ++marker[row]; } } } #if ( DEBUGlevel>=1 ) if ( !iam ) dPrint_CompCol_Matrix_dist(&GA); #endif /* Destroy GA */ Destroy_CompCol_Matrix_dist(&GA); /******************************************************/ /* Change GA to a local A with NR_loc format */ /******************************************************/ /* Set up the local A in NR_loc format */ zCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, nzval_loc, colind, rowptr, SLU_NR_loc, SLU_Z, SLU_GE); SUPERLU_FREE(marker); #if ( DEBUGlevel>=1 ) printf("sizeof(NRforamt_loc) %d\n", sizeof(NRformat_loc)); CHECK_MALLOC(iam, "Exit dcreate_dist_matrix()"); #endif return 0; }