Example 1
static PetscErrorCode MatStashScatterBegin_Ref(Mat mat,MatStash *stash,PetscInt *owners)
{
  PetscInt           *owner,*startv,*starti,tag1=stash->tag1,tag2=stash->tag2,bs2;
  PetscInt           size=stash->size,nsends;
  PetscErrorCode     ierr;
  PetscInt           count,*sindices,**rindices,i,j,idx,lastidx,l;
  PetscScalar        **rvalues,*svalues;
  MPI_Comm           comm = stash->comm;
  MPI_Request        *send_waits,*recv_waits,*recv_waits1,*recv_waits2;
  PetscMPIInt        *sizes,*nlengths,nreceives;
  PetscInt           *sp_idx,*sp_idy;
  PetscScalar        *sp_val;
  PetscMatStashSpace space,space_next;

  PetscFunctionBegin;
  {                             /* make sure all processes use the same insert mode (INSERT_VALUES or ADD_VALUES) */
    InsertMode addv;
    ierr = MPIU_Allreduce((PetscEnum*)&mat->insertmode,(PetscEnum*)&addv,1,MPIU_ENUM,MPI_BOR,PetscObjectComm((PetscObject)mat));CHKERRQ(ierr);
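    /* the bitwise OR of all ranks' modes equals ADD_VALUES|INSERT_VALUES exactly when both modes occur on some rank */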
    if (addv == (ADD_VALUES|INSERT_VALUES)) SETERRQ(PetscObjectComm((PetscObject)mat),PETSC_ERR_ARG_WRONGSTATE,"Some processors inserted others added");
    mat->insertmode = addv; /* in case this processor had no cache */
  }

  bs2 = stash->bs*stash->bs;

  /*  first count number of contributors to each processor */
  ierr = PetscCalloc1(size,&sizes);CHKERRQ(ierr);
  ierr = PetscCalloc1(size,&nlengths);CHKERRQ(ierr);
  ierr = PetscMalloc1(stash->n+1,&owner);CHKERRQ(ierr);

  i       = j    = 0;
  lastidx = -1;
  space   = stash->space_head;
  while (space) {
    space_next = space->next;
    sp_idx     = space->idx;
    for (l=0; l<space->local_used; l++) {
      /* if indices are NOT locally sorted, need to start search at the beginning */
      if (lastidx > (idx = sp_idx[l])) j = 0;
      lastidx = idx;
      for (; j<size; j++) {
        if (idx >= owners[j] && idx < owners[j+1]) {
          nlengths[j]++; owner[i] = j; break;
        }
      }
      i++;
    }
    space = space_next;
  }
  /* Now check what procs get messages - and compute nsends. */
  for (i=0, nsends=0; i<size; i++) {
    if (nlengths[i]) {
      sizes[i] = 1; nsends++;
    }
  }

  {PetscMPIInt *onodes,*olengths;
   /* Determine the number of messages to expect, their lengths, and the sending ranks */
   ierr = PetscGatherNumberOfMessages(comm,sizes,nlengths,&nreceives);CHKERRQ(ierr);
   ierr = PetscGatherMessageLengths(comm,nsends,nreceives,nlengths,&onodes,&olengths);CHKERRQ(ierr);
   /* row and column indices are sent together, so lengths are multiplied by 2 */
   for (i=0; i<nreceives; i++) olengths[i] *=2;
   ierr = PetscPostIrecvInt(comm,tag1,nreceives,onodes,olengths,&rindices,&recv_waits1);CHKERRQ(ierr);
   /* values are bs2 scalars per entry (and remove the earlier factor of 2) */
   for (i=0; i<nreceives; i++) olengths[i] = olengths[i]*bs2/2;
   ierr = PetscPostIrecvScalar(comm,tag2,nreceives,onodes,olengths,&rvalues,&recv_waits2);CHKERRQ(ierr);
   ierr = PetscFree(onodes);CHKERRQ(ierr);
   ierr = PetscFree(olengths);CHKERRQ(ierr);}

  /* do sends:
      1) startv[i]/starti[i] give the starting offsets in svalues/sindices for
         the entries going to the ith processor
  */
  ierr = PetscMalloc2(bs2*stash->n,&svalues,2*(stash->n+1),&sindices);CHKERRQ(ierr);
  ierr = PetscMalloc1(2*nsends,&send_waits);CHKERRQ(ierr);
  ierr = PetscMalloc2(size,&startv,size,&starti);CHKERRQ(ierr);
  /* use 2 sends per destination: one with the indices (row,col pairs), the other with the values */
  startv[0] = 0; starti[0] = 0;
  for (i=1; i<size; i++) {
    startv[i] = startv[i-1] + nlengths[i-1];
    starti[i] = starti[i-1] + 2*nlengths[i-1];
  }
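  /* for entries destined for rank j: values are packed at svalues+bs2*startv[j], row indices at
     sindices+starti[j], and the matching column indices nlengths[j] slots later */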

  i     = 0;
  space = stash->space_head;
  while (space) {
    space_next = space->next;
    sp_idx     = space->idx;
    sp_idy     = space->idy;
    sp_val     = space->val;
    for (l=0; l<space->local_used; l++) {
      j = owner[i];
      if (bs2 == 1) {
        svalues[startv[j]] = sp_val[l];
      } else {
        PetscInt    k;
        PetscScalar *buf1,*buf2;
        buf1 = svalues+bs2*startv[j];
        buf2 = space->val + bs2*l;
        for (k=0; k<bs2; k++) buf1[k] = buf2[k];
      }
      sindices[starti[j]]             = sp_idx[l];
      sindices[starti[j]+nlengths[j]] = sp_idy[l];
      startv[j]++;
      starti[j]++;
      i++;
    }
    space = space_next;
  }
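  /* the packing loop advanced startv[]; rebuild it so it again holds each rank's starting offset for the sends */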
  startv[0] = 0;
  for (i=1; i<size; i++) startv[i] = startv[i-1] + nlengths[i-1];

  for (i=0,count=0; i<size; i++) {
    if (sizes[i]) {
      ierr = MPI_Isend(sindices+2*startv[i],2*nlengths[i],MPIU_INT,i,tag1,comm,send_waits+count++);CHKERRQ(ierr);
      ierr = MPI_Isend(svalues+bs2*startv[i],bs2*nlengths[i],MPIU_SCALAR,i,tag2,comm,send_waits+count++);CHKERRQ(ierr);
    }
  }
#if defined(PETSC_USE_INFO)
  ierr = PetscInfo1(NULL,"No of messages: %d \n",nsends);CHKERRQ(ierr);
  for (i=0; i<size; i++) {
    if (sizes[i]) {
      ierr = PetscInfo2(NULL,"Mesg_to: %d: size: %d bytes\n",i,(int)(nlengths[i]*(bs2*sizeof(PetscScalar)+2*sizeof(PetscInt))));CHKERRQ(ierr);
    }
  }
#endif
  ierr = PetscFree(nlengths);CHKERRQ(ierr);
  ierr = PetscFree(owner);CHKERRQ(ierr);
  ierr = PetscFree2(startv,starti);CHKERRQ(ierr);
  ierr = PetscFree(sizes);CHKERRQ(ierr);

  /* recv_waits need to be contiguous for MatStashScatterGetMesg_Private() */
  ierr = PetscMalloc1(2*nreceives,&recv_waits);CHKERRQ(ierr);

  for (i=0; i<nreceives; i++) {
    recv_waits[2*i]   = recv_waits1[i];
    recv_waits[2*i+1] = recv_waits2[i];
  }
  stash->recv_waits = recv_waits;

  ierr = PetscFree(recv_waits1);CHKERRQ(ierr);
  ierr = PetscFree(recv_waits2);CHKERRQ(ierr);

  stash->svalues         = svalues;
  stash->sindices        = sindices;
  stash->rvalues         = rvalues;
  stash->rindices        = rindices;
  stash->send_waits      = send_waits;
  stash->nsends          = nsends;
  stash->nrecvs          = nreceives;
  stash->reproduce_count = 0;
  PetscFunctionReturn(0);
}
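The routine above follows a count / prefix-offset / pack scheme before posting the nonblocking sends. A minimal standalone sketch of that scheme (plain C, no MPI; all names and sizes are illustrative and not part of PETSc):

#include <stdio.h>

/* Sketch of the count -> prefix-offset -> pack pattern used above (illustrative only) */
int main(void)
{
  const int nranks = 3, n = 6;
  int    owner[6] = {2,0,1,2,0,2};          /* destination rank of each entry      */
  double val[6]   = {10,20,30,40,50,60};    /* the entries themselves              */
  int    len[3]   = {0,0,0}, start[3];
  double packed[6];
  int    i;

  for (i=0; i<n; i++) len[owner[i]]++;      /* 1) count entries per destination    */
  start[0] = 0;                             /* 2) prefix sums give block offsets   */
  for (i=1; i<nranks; i++) start[i] = start[i-1] + len[i-1];
  for (i=0; i<n; i++) packed[start[owner[i]]++] = val[i];  /* 3) pack, advancing cursors */
  start[0] = 0;                             /* 4) cursors were advanced; recompute before use */
  for (i=1; i<nranks; i++) start[i] = start[i-1] + len[i-1];

  for (i=0; i<nranks; i++) printf("rank %d: offset %d, %d entries\n",i,start[i],len[i]);
  return 0;
}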
Example 2
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode      ierr;
  Mat_MPIAIJ          *a =(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ          *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
  Mat_SeqAIJ          *p_loc,*p_oth;
  Mat_PtAPMPI         *ptap;
  Mat_Merge_SeqsToMPI *merge;
  PetscInt            *adi=ad->i,*aoi=ao->i,*adj,*aoj,*apJ,nextp;
  PetscInt            *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pJ,*pj;
  PetscInt            i,j,k,anz,pnz,apnz,nextap,row,*cj;
  MatScalar           *ada,*aoa,*apa,*pa,*ca,*pa_loc,*pa_oth,valtmp;
  PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
  MPI_Comm            comm;
  PetscMPIInt         size,rank,taga,*len_s;
  PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
  PetscInt            **buf_ri,**buf_rj;
  PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
  MPI_Request         *s_waits,*r_waits;
  MPI_Status          *status;
  MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
  PetscInt            *api,*apj,*coi,*coj;
  PetscInt            *poJ=po->j,*pdJ=pd->j,pcstart=P->cmap->rstart,pcend=P->cmap->rend;
  PetscBool           scalable;
#if defined(PTAP_PROFILE)
  PetscLogDouble t0,t1,t2,t3,t4,et2_AP=0.0,et2_PtAP=0.0,t2_0,t2_1,t2_2;
#endif

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t0);CHKERRQ(ierr);
#endif
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  ptap = c->ptap;
  if (!ptap) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_INCOMP,"MatPtAP() has not been called to create matrix C yet, cannot use MAT_REUSE_MATRIX");
  merge    = ptap->merge;
  apa      = ptap->apa;
  scalable = ptap->scalable;

  /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
  /*--------------------------------------------------*/
  if (ptap->reuse == MAT_INITIAL_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic(), skip calling MatGetBrowsOfAoCols() and MatMPIAIJGetLocalMat() */
    ptap->reuse = MAT_REUSE_MATRIX;
  } else { /* update numerical values of P_oth and P_loc */
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
    ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
  }
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t1);CHKERRQ(ierr);
#endif

  /* 2) compute numeric C_seq = P_loc^T*A_loc*P - dominating part */
  /*--------------------------------------------------------------*/
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  pi_loc=p_loc->i; pj_loc=p_loc->j; pJ=pj_loc; pa_loc=p_loc->a;
  pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
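  /* pi_*, pj_*, pa_* are the CSR row pointers, column indices, and values of P_loc and P_oth */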

  coi  = merge->coi; coj = merge->coj;
  ierr = PetscCalloc1(coi[pon]+1,&coa);CHKERRQ(ierr);

  bi     = merge->bi; bj = merge->bj;
  owners = merge->rowmap->range;
  ierr   = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr);  /* ba: Cseq->a */

  api = ptap->api; apj = ptap->apj;
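  /* api/apj hold the row pointers and column indices of A*P computed in the symbolic phase */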

  if (!scalable) { /* Do dense axpy on apa (length of pN, stores A[i,:]*P) - nonscalable, but fast */
    ierr = PetscInfo(C,"Using non-scalable dense axpy\n");CHKERRQ(ierr);
    /*-----------------------------------------------------------------------------------------------------*/
    for (i=0; i<am; i++) {
#if defined(PTAP_PROFILE)
      ierr = PetscTime(&t2_0);CHKERRQ(ierr);
#endif
      /* 2-a) form i-th sparse row of A_loc*P = Ad*P_loc + Ao*P_oth */
      /*------------------------------------------------------------*/
      apJ = apj + api[i];

      /* diagonal portion of A */
      anz = adi[i+1] - adi[i];
      adj = ad->j + adi[i];
      ada = ad->a + adi[i];
      for (j=0; j<anz; j++) {
        row = adj[j];
        pnz = pi_loc[row+1] - pi_loc[row];
        pj  = pj_loc + pi_loc[row];
        pa  = pa_loc + pi_loc[row];

        /* perform dense axpy */
        valtmp = ada[j];
        for (k=0; k<pnz; k++) {
          apa[pj[k]] += valtmp*pa[k];
        }
        ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
      }

      /* off-diagonal portion of A */
      anz = aoi[i+1] - aoi[i];
      aoj = ao->j + aoi[i];
      aoa = ao->a + aoi[i];
      for (j=0; j<anz; j++) {
        row = aoj[j];
        pnz = pi_oth[row+1] - pi_oth[row];
        pj  = pj_oth + pi_oth[row];
        pa  = pa_oth + pi_oth[row];

        /* perform dense axpy */
        valtmp = aoa[j];
        for (k=0; k<pnz; k++) {
          apa[pj[k]] += valtmp*pa[k];
        }
        ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
      }
#if defined(PTAP_PROFILE)
      ierr    = PetscTime(&t2_1);CHKERRQ(ierr);
      et2_AP += t2_1 - t2_0;
#endif

      /* 2-b) Compute Cseq = P_loc[i,:]^T*AP[i,:] using outer product */
      /*--------------------------------------------------------------*/
      apnz = api[i+1] - api[i];
      /* put the value into Co=(p->B)^T*AP (off-diagonal part, send to others) */
      pnz = po->i[i+1] - po->i[i];
      poJ = po->j + po->i[i];
      pA  = po->a + po->i[i];
      for (j=0; j<pnz; j++) {
        row = poJ[j];
        cnz = coi[row+1] - coi[row];
        cj  = coj + coi[row];
        ca  = coa + coi[row];
        /* perform dense axpy */
        valtmp = pA[j];
        for (k=0; k<cnz; k++) {
          ca[k] += valtmp*apa[cj[k]];
        }
        ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
      }

      /* put the value into Cd (diagonal part) */
      pnz = pd->i[i+1] - pd->i[i];
      pdJ = pd->j + pd->i[i];
      pA  = pd->a + pd->i[i];
      for (j=0; j<pnz; j++) {
        row = pdJ[j];
        cnz = bi[row+1] - bi[row];
        cj  = bj + bi[row];
        ca  = ba + bi[row];
        /* perform dense axpy */
        valtmp = pA[j];
        for (k=0; k<cnz; k++) {
          ca[k] += valtmp*apa[cj[k]];
        }
        ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
      }

      /* zero the current row of A*P */
      for (k=0; k<apnz; k++) apa[apJ[k]] = 0.0;
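      /* apa is a dense work array; only the apnz columns present in row i of A*P were touched, so clearing those suffices */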
#if defined(PTAP_PROFILE)
      ierr      = PetscTime(&t2_2);CHKERRQ(ierr);
      et2_PtAP += t2_2 - t2_1;
#endif
    }
  } else { /* Do sparse axpy on apa (length of ap_rmax, stores A[i,:]*P) - scalable, but slower */
    ierr = PetscInfo(C,"Using scalable sparse axpy\n");CHKERRQ(ierr);
    /*-----------------------------------------------------------------------------------------*/
    pA=pa_loc;
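    /* pA walks the values of P_loc one row at a time (advanced by pnz at the end of each row) */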
    for (i=0; i<am; i++) {
#if defined(PTAP_PROFILE)
      ierr = PetscTime(&t2_0);CHKERRQ(ierr);
#endif
      /* form i-th sparse row of A*P */
      apnz = api[i+1] - api[i];
      apJ  = apj + api[i];
      /* diagonal portion of A */
      anz = adi[i+1] - adi[i];
      adj = ad->j + adi[i];
      ada = ad->a + adi[i];
      for (j=0; j<anz; j++) {
        row    = adj[j];
        pnz    = pi_loc[row+1] - pi_loc[row];
        pj     = pj_loc + pi_loc[row];
        pa     = pa_loc + pi_loc[row];
        valtmp = ada[j];
        nextp  = 0;
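        /* single-pass merge: apJ and pj are both sorted and pj's columns are a subset of apJ's (here and in the off-diagonal loop below) */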
        for (k=0; nextp<pnz; k++) {
          if (apJ[k] == pj[nextp]) { /* col of AP == col of P */
            apa[k] += valtmp*pa[nextp++];
          }
        }
        ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
      }
      /* off-diagonal portion of A */
      anz = aoi[i+1] - aoi[i];
      aoj = ao->j + aoi[i];
      aoa = ao->a + aoi[i];
      for (j=0; j<anz; j++) {
        row    = aoj[j];
        pnz    = pi_oth[row+1] - pi_oth[row];
        pj     = pj_oth + pi_oth[row];
        pa     = pa_oth + pi_oth[row];
        valtmp = aoa[j];
        nextp  = 0;
        for (k=0; nextp<pnz; k++) {
          if (apJ[k] == pj[nextp]) { /* col of AP == col of P */
            apa[k] += valtmp*pa[nextp++];
          }
        }
        ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
      }
#if defined(PTAP_PROFILE)
      ierr    = PetscTime(&t2_1);CHKERRQ(ierr);
      et2_AP += t2_1 - t2_0;
#endif

      /* 2-b) Compute Cseq = P_loc[i,:]^T*AP[i,:] using outer product */
      /*--------------------------------------------------------------*/
      pnz = pi_loc[i+1] - pi_loc[i];
      pJ  = pj_loc + pi_loc[i];
      for (j=0; j<pnz; j++) {
        nextap = 0;
        row    = pJ[j]; /* global index */
        if (row < pcstart || row >=pcend) { /* put the value into Co */
          row = *poJ;
          cj  = coj + coi[row];
          ca  = coa + coi[row]; poJ++;
        } else {                            /* put the value into Cd */
          row = *pdJ;
          cj  = bj + bi[row];
          ca  = ba + bi[row]; pdJ++;
        }
        valtmp = pA[j];
        for (k=0; nextap<apnz; k++) {
          if (cj[k]==apJ[nextap]) ca[k] += valtmp*apa[nextap++];
        }
        ierr = PetscLogFlops(2.0*apnz);CHKERRQ(ierr);
      }
      pA += pnz;
      /* zero the current row info for A*P */
      ierr = PetscMemzero(apa,apnz*sizeof(MatScalar));CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
      ierr      = PetscTime(&t2_2);CHKERRQ(ierr);
      et2_PtAP += t2_2 - t2_1;
#endif
    }
  }
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t2);CHKERRQ(ierr);
#endif

  /* 3) send and recv matrix values coa */
  /*------------------------------------*/
  buf_ri = merge->buf_ri;
  buf_rj = merge->buf_rj;
  len_s  = merge->len_s;
  ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
  ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);

  ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = merge->owners_co[proc];
    ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
    k++;
  }
  if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}

  ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
  ierr = PetscFree(r_waits);CHKERRQ(ierr);
  ierr = PetscFree(coa);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t3);CHKERRQ(ierr);
#endif

  /* 4) insert local Cseq and received values into Cmpi */
  /*------------------------------------------------------*/
  ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
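  /* each received i-structure is laid out as [nrows, the nrows row numbers, the row pointers]; set up cursors into it */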
  for (k=0; k<merge->nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of the k-th received i-structure */
    nrows       = *(buf_ri_k[k]);
    nextrow[k]  = buf_ri_k[k]+1;  /* next row number of the k-th received i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the row pointers of the k-th received i-structure */
  }

  for (i=0; i<cm; i++) {
    row  = owners[rank] + i; /* global row index of C_seq */
    bj_i = bj + bi[i];  /* col indices of the i-th row of C */
    ba_i = ba + bi[i];
    bnz  = bi[i+1] - bi[i];
    /* add received vals into ba */
    for (k=0; k<merge->nrecv; k++) { /* k-th received message */
      /* i-th row */
      if (i == *nextrow[k]) {
        cnz    = *(nextci[k]+1) - *nextci[k];
        cj     = buf_rj[k] + *(nextci[k]);
        ca     = abuf_r[k] + *(nextci[k]);
        nextcj = 0;
        for (j=0; nextcj<cnz; j++) {
          if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
            ba_i[j] += ca[nextcj++];
          }
        }
        nextrow[k]++; nextci[k]++;
        ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
      }
    }
    ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ierr = PetscFree(ba);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r);CHKERRQ(ierr);
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t4);CHKERRQ(ierr);
  if (rank==1) {ierr = PetscPrintf(MPI_COMM_SELF,"  [%d] PtAPNum %g/P + %g/PtAP( %g + %g ) + %g/comm + %g/Cloc = %g\n\n",rank,t1-t0,t2-t1,et2_AP,et2_PtAP,t3-t2,t4-t3,t4-t0);CHKERRQ(ierr);}
#endif
  PetscFunctionReturn(0);
}
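For context, a minimal usage sketch follows (A and P are assumed to be already assembled MPIAIJ matrices and appear only for illustration; the fill value 2.0 is arbitrary). The first MatPtAP() call with MAT_INITIAL_MATRIX runs the symbolic phase and creates C; subsequent calls with MAT_REUSE_MATRIX re-enter the numeric routine above.

  Mat            C;
  PetscErrorCode ierr;

  ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr); /* symbolic + first numeric product */
  /* ... update the numerical values of A (same nonzero pattern) ... */
  ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);CHKERRQ(ierr);    /* numeric phase only */
  ierr = MatDestroy(&C);CHKERRQ(ierr);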