/*@C
   PetscThreadReductionBegin - Initiates a threaded reduction and returns a
                               reduction object to be passed to PetscThreadCommRunKernel

   Input Parameters:
+  comm  - the MPI comm
.  op    - the reduction operation
.  type  - the data type for reduction
-  nreds - number of reductions

   Output Parameter:
.  redout - the reduction context

   Level: developer

   Notes:
   See include/petscthreadcomm.h for the available reduction operations.

   To be called from the main thread before calling PetscThreadCommRunKernel

.seealso: PetscThreadCommReductionKernelPost(), PetscThreadCommReductionKernelEnd(), PetscThreadCommReductionEnd()
@*/
PetscErrorCode PetscThreadReductionBegin(MPI_Comm comm,PetscThreadCommReductionOp op,PetscDataType type,PetscInt nreds,PetscThreadCommReduction *redout)
{
  PetscErrorCode           ierr;
  PetscThreadComm          tcomm;
  PetscInt                 i;
  PetscThreadCommRedCtx    redctx;
  PetscThreadCommReduction red;

  PetscFunctionBegin;
  ierr = PetscCommGetThreadComm(comm,&tcomm);CHKERRQ(ierr);
  red  = tcomm->red;
  if (red->ctr+nreds > PETSC_REDUCTIONS_MAX) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Reductions in operation: %D Max. allowed: %D",red->ctr+nreds,PETSC_REDUCTIONS_MAX);
  for (i=red->ctr; i<red->ctr+nreds; i++) {
    redctx             = &red->redctx[i];
    redctx->op         = op;
    redctx->type       = type;
    redctx->red_status = THREADCOMM_REDUCTION_NEW;
    redctx->tcomm      = tcomm;
  }
  red->nreds += nreds;
  red->ctr    = red->ctr+nreds;
  *redout     = red;
  PetscFunctionReturn(0);
}
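/*
   Usage sketch (illustrative only, not part of the build): the call sequence from
   the main thread suggested by the Notes above.  The reduction op THREADCOMM_SUM,
   the variadic launch form of PetscThreadCommRunKernel(), the hypothetical kernel
   MyKernel(), and PetscThreadReductionEnd() taking the result address are
   assumptions drawn from include/petscthreadcomm.h, not guaranteed by this file.

     PetscThreadCommReduction red;
     PetscScalar              sum;

     ierr = PetscThreadReductionBegin(comm,THREADCOMM_SUM,PETSC_SCALAR,1,&red);CHKERRQ(ierr);
     ierr = PetscThreadCommRunKernel(comm,(PetscThreadKernel)MyKernel,2,x,red);CHKERRQ(ierr);
     ierr = PetscThreadReductionEnd(red,&sum);CHKERRQ(ierr);

   Inside MyKernel() each thread posts its local contribution with
   PetscThreadCommReductionKernelPost() (see the .seealso list above).
*/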
PetscErrorCode PetscSubcommCreate_interlaced(PetscSubcomm psubcomm)
{
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,*subsize,duprank,subrank;
  PetscMPIInt    np_subcomm,nleftover,i,j,color,nsubcomm=psubcomm->n;
  MPI_Comm       subcomm=0,dupcomm=0,comm=psubcomm->parent;

  PetscFunctionBegin;
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);

  /* get size of each subcommunicator */
  ierr = PetscMalloc((1+nsubcomm)*sizeof(PetscMPIInt),&subsize);CHKERRQ(ierr);

  np_subcomm = size/nsubcomm;
  nleftover  = size - nsubcomm*np_subcomm;
  for (i=0; i<nsubcomm; i++) {
    subsize[i] = np_subcomm;
    if (i<nleftover) subsize[i]++;
  }

  /* find color for this proc */
  color   = rank%nsubcomm;
  subrank = rank/nsubcomm;

  ierr = MPI_Comm_split(comm,color,subrank,&subcomm);CHKERRQ(ierr);

  j = 0; duprank = 0;
  for (i=0; i<nsubcomm; i++) {
    if (j == color) {
      duprank += subrank;
      break;
    }
    duprank += subsize[i]; j++;
  }

  /* create dupcomm with same size as comm, but its rank, duprank, maps subcomm's contiguously into dupcomm */
  ierr = MPI_Comm_split(comm,0,duprank,&dupcomm);CHKERRQ(ierr);
  {
    PetscThreadComm tcomm;
    ierr = PetscCommGetThreadComm(comm,&tcomm);CHKERRQ(ierr);
    ierr = MPI_Attr_put(dupcomm,Petsc_ThreadComm_keyval,tcomm);CHKERRQ(ierr);
    tcomm->refct++;
    ierr = MPI_Attr_put(subcomm,Petsc_ThreadComm_keyval,tcomm);CHKERRQ(ierr);
    tcomm->refct++;
  }

  ierr = PetscCommDuplicate(dupcomm,&psubcomm->dupparent,NULL);CHKERRQ(ierr);
  ierr = PetscCommDuplicate(subcomm,&psubcomm->comm,NULL);CHKERRQ(ierr);
  ierr = MPI_Comm_free(&dupcomm);CHKERRQ(ierr);
  ierr = MPI_Comm_free(&subcomm);CHKERRQ(ierr);

  psubcomm->color   = color;
  psubcomm->subsize = subsize;
  psubcomm->type    = PETSC_SUBCOMM_INTERLACED;
  PetscFunctionReturn(0);
}
PetscErrorCode PetscSubcommCreate_contiguous(PetscSubcomm psubcomm)
{
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,*subsize,duprank=-1,subrank=-1;
  PetscMPIInt    np_subcomm,nleftover,i,color=-1,rankstart,nsubcomm=psubcomm->n;
  MPI_Comm       subcomm=0,dupcomm=0,comm=psubcomm->parent;

  PetscFunctionBegin;
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);

  /* get size of each subcommunicator */
  ierr = PetscMalloc((1+nsubcomm)*sizeof(PetscMPIInt),&subsize);CHKERRQ(ierr);

  np_subcomm = size/nsubcomm;
  nleftover  = size - nsubcomm*np_subcomm;
  for (i=0; i<nsubcomm; i++) {
    subsize[i] = np_subcomm;
    if (i<nleftover) subsize[i]++;
  }

  /* get color and subrank of this proc */
  rankstart = 0;
  for (i=0; i<nsubcomm; i++) {
    if (rank >= rankstart && rank < rankstart+subsize[i]) {
      color   = i;
      subrank = rank - rankstart;
      duprank = rank;
      break;
    } else rankstart += subsize[i];
  }

  ierr = MPI_Comm_split(comm,color,subrank,&subcomm);CHKERRQ(ierr);

  /* create dupcomm with same size as comm, but its rank, duprank, maps subcomm's contiguously into dupcomm */
  ierr = MPI_Comm_split(comm,0,duprank,&dupcomm);CHKERRQ(ierr);
  {
    PetscThreadComm tcomm;
    ierr = PetscCommGetThreadComm(comm,&tcomm);CHKERRQ(ierr);
    ierr = MPI_Attr_put(dupcomm,Petsc_ThreadComm_keyval,tcomm);CHKERRQ(ierr);
    tcomm->refct++;
    ierr = MPI_Attr_put(subcomm,Petsc_ThreadComm_keyval,tcomm);CHKERRQ(ierr);
    tcomm->refct++;
  }

  ierr = PetscCommDuplicate(dupcomm,&psubcomm->dupparent,NULL);CHKERRQ(ierr);
  ierr = PetscCommDuplicate(subcomm,&psubcomm->comm,NULL);CHKERRQ(ierr);
  ierr = MPI_Comm_free(&dupcomm);CHKERRQ(ierr);
  ierr = MPI_Comm_free(&subcomm);CHKERRQ(ierr);

  psubcomm->color   = color;
  psubcomm->subsize = subsize;
  psubcomm->type    = PETSC_SUBCOMM_CONTIGUOUS;
  PetscFunctionReturn(0);
}
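/*
   Worked example of the two splitting schemes above (derived directly from the
   code; the concrete numbers are illustrative only).  With size=5 ranks in comm
   and nsubcomm=2, subsize = {3,2}:

     interlaced:  color = rank%nsubcomm, so ranks {0,2,4} form subcomm 0 and
                  ranks {1,3} form subcomm 1
     contiguous:  ranks {0,1,2} form subcomm 0 and ranks {3,4} form subcomm 1

   In both cases dupcomm has the same size as comm, but duprank is chosen so that
   the members of each subcomm are numbered contiguously within dupcomm.
*/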
EXTERN_C_END

#undef __FUNCT__
#define __FUNCT__ "PetscThreadCommRunKernel_OpenMP"
PetscErrorCode PetscThreadCommRunKernel_OpenMP(MPI_Comm comm,PetscThreadCommJobCtx job)
{
  PetscErrorCode  ierr;
  PetscThreadComm tcomm;
  PetscInt        trank=0;

  PetscFunctionBegin;
  ierr = PetscCommGetThreadComm(comm,&tcomm);CHKERRQ(ierr);
#pragma omp parallel num_threads(tcomm->nworkThreads) shared(comm,job) private(trank,ierr)
  {
    trank = omp_get_thread_num();
    PetscRunKernel(trank,job->nargs,job);
    job->job_status[trank] = THREAD_JOB_COMPLETED;
  }
  PetscFunctionReturn(0);
}
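/*
   Sketch of the kernel each OpenMP worker runs above (illustrative only): every
   thread receives its rank and the job context, and PetscRunKernel() invokes the
   user kernel that was handed to PetscThreadCommRunKernel().  The variadic
   prototype and the name MyKernel below are assumptions, not taken from this file.

     PetscErrorCode MyKernel(PetscInt trank,...)
     {
       return 0;
     }

   Here trank is the worker's rank within the thread communicator and the
   remaining variadic arguments are those supplied to PetscThreadCommRunKernel().
*/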
PETSC_EXTERN PetscErrorCode PetscThreadCommCreate_PThread(PetscThreadComm tcomm)
{
  PetscThreadComm_PThread ptcomm;
  PetscErrorCode          ierr;
  PetscInt                i;

  PetscFunctionBegin;
  ptcommcrtct++;
  ierr = PetscStrcpy(tcomm->type,PTHREAD);CHKERRQ(ierr);
  ierr = PetscNew(&ptcomm);CHKERRQ(ierr);

  tcomm->data              = (void*)ptcomm;
  ptcomm->nthreads         = 0;
  ptcomm->sync             = PTHREADSYNC_LOCKFREE;
  ptcomm->aff              = PTHREADAFFPOLICY_ONECORE;
  ptcomm->spark            = PTHREADPOOLSPARK_SELF;
  ptcomm->ismainworker     = PETSC_TRUE;
  ptcomm->synchronizeafter = PETSC_TRUE;
  tcomm->ops->destroy      = PetscThreadCommDestroy_PThread;
  tcomm->ops->runkernel    = PetscThreadCommRunKernel_PThread_LockFree;
  tcomm->ops->barrier      = PetscThreadCommBarrier_PThread_LockFree;
  tcomm->ops->getrank      = PetscThreadCommGetRank_PThread;

  ierr = PetscMalloc1(tcomm->nworkThreads,&ptcomm->granks);CHKERRQ(ierr);

  if (!PetscPThreadCommInitializeCalled) { /* Only done for PETSC_THREAD_COMM_WORLD */
    PetscBool flg1,flg2,flg3,flg4;

    PetscPThreadCommInitializeCalled = PETSC_TRUE;

    ierr = PetscOptionsBegin(PETSC_COMM_WORLD,NULL,"PThread communicator options",NULL);CHKERRQ(ierr);
    ierr = PetscOptionsBool("-threadcomm_pthread_main_is_worker","Main thread is also a worker thread",NULL,PETSC_TRUE,&ptcomm->ismainworker,&flg1);CHKERRQ(ierr);
    ierr = PetscOptionsEnum("-threadcomm_pthread_affpolicy","Thread affinity policy"," ",PetscPThreadCommAffinityPolicyTypes,(PetscEnum)ptcomm->aff,(PetscEnum*)&ptcomm->aff,&flg2);CHKERRQ(ierr);
    ierr = PetscOptionsEnum("-threadcomm_pthread_type","Thread pool type"," ",PetscPThreadCommSynchronizationTypes,(PetscEnum)ptcomm->sync,(PetscEnum*)&ptcomm->sync,&flg3);CHKERRQ(ierr);
    ierr = PetscOptionsEnum("-threadcomm_pthread_spark","Thread pool spark type"," ",PetscPThreadCommPoolSparkTypes,(PetscEnum)ptcomm->spark,(PetscEnum*)&ptcomm->spark,&flg4);CHKERRQ(ierr);
    ierr = PetscOptionsBool("-threadcomm_pthread_synchronizeafter","Puts a barrier after every kernel call",NULL,PETSC_TRUE,&ptcomm->synchronizeafter,&flg1);CHKERRQ(ierr);
    ierr = PetscOptionsEnd();CHKERRQ(ierr);

    if (ptcomm->ismainworker) {
      ptcomm->nthreads         = tcomm->nworkThreads-1;
      ptcomm->thread_num_start = 1;
    } else {
      ptcomm->nthreads         = tcomm->nworkThreads;
      ptcomm->thread_num_start = 0;
    }

    switch (ptcomm->sync) {
    case PTHREADSYNC_LOCKFREE:
      ptcomm->initialize    = PetscPThreadCommInitialize_LockFree;
      ptcomm->finalize      = PetscPThreadCommFinalize_LockFree;
      tcomm->ops->runkernel = PetscThreadCommRunKernel_PThread_LockFree;
      tcomm->ops->barrier   = PetscThreadCommBarrier_PThread_LockFree;
      break;
    default:
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only Lock-free synchronization scheme supported currently");
    }

    /* Set up thread ranks */
    for (i=0; i<tcomm->nworkThreads; i++) ptcomm->granks[i] = i;

    if (ptcomm->ismainworker) {
#if defined(PETSC_PTHREAD_LOCAL)
      PetscPThreadRank = 0; /* Main thread rank */
#else
      ierr = pthread_key_create(&PetscPThreadRankkey,NULL);CHKERRQ(ierr);
      ierr = pthread_setspecific(PetscPThreadRankkey,&ptcomm->granks[0]);CHKERRQ(ierr);
#endif
    }

    /* Set the leader thread rank */
    if (ptcomm->nthreads) {
      if (ptcomm->ismainworker) tcomm->leader = ptcomm->granks[1];
      else tcomm->leader = ptcomm->granks[0];
    }

    /* Create array holding pthread ids */
    ierr = PetscMalloc1(tcomm->nworkThreads,&ptcomm->tid);CHKERRQ(ierr);
    /* Create thread attributes */
    ierr = PetscMalloc1(tcomm->nworkThreads,&ptcomm->attr);CHKERRQ(ierr);
    ierr = PetscThreadCommSetPThreadAttributes(tcomm);CHKERRQ(ierr);

    if (ptcomm->ismainworker) {
      /* Pin main thread */
#if defined(PETSC_HAVE_SCHED_CPU_SET_T)
      cpu_set_t mset;
      PetscInt  ncores,icorr;

      ierr  = PetscGetNCores(&ncores);CHKERRQ(ierr);
      CPU_ZERO(&mset);
      icorr = tcomm->affinities[0]%ncores;
      CPU_SET(icorr,&mset);
      sched_setaffinity(0,sizeof(cpu_set_t),&mset);
#endif
    }
    /* Initialize thread pool */
    ierr = (*ptcomm->initialize)(tcomm);CHKERRQ(ierr);
  } else {
    PetscThreadComm         gtcomm;
    PetscThreadComm_PThread gptcomm;
    PetscInt                *granks,j,*gaffinities;

    ierr        = PetscCommGetThreadComm(PETSC_COMM_WORLD,&gtcomm);CHKERRQ(ierr);
    gaffinities = gtcomm->affinities;
    gptcomm     = (PetscThreadComm_PThread)tcomm->data;
    granks      = gptcomm->granks;

    /* Copy over the data from the global thread communicator structure */
    ptcomm->ismainworker     = gptcomm->ismainworker;
    ptcomm->thread_num_start = gptcomm->thread_num_start;
    ptcomm->sync             = gptcomm->sync;
    ptcomm->aff              = gptcomm->aff;
    tcomm->ops->runkernel    = gtcomm->ops->runkernel;
    tcomm->ops->barrier      = gtcomm->ops->barrier;

    for (i=0; i<tcomm->nworkThreads; i++) {
      for (j=0; j<gtcomm->nworkThreads; j++) {
        if (tcomm->affinities[i] == gaffinities[j]) ptcomm->granks[i] = granks[j];
      }
    }
  }
  PetscFunctionReturn(0);
}
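/*
   Example of selecting this backend's behaviour at runtime (illustrative only).
   The -threadcomm_pthread_* options are exactly those registered above; the
   executable name and the -threadcomm_type / -threadcomm_nthreads options are
   assumptions used for illustration, and the legal values of the enum-valued
   options (-threadcomm_pthread_affpolicy, -threadcomm_pthread_type,
   -threadcomm_pthread_spark) are listed by -help.

     ./ex1 -threadcomm_type pthread -threadcomm_nthreads 4 \
           -threadcomm_pthread_main_is_worker 0 \
           -threadcomm_pthread_synchronizeafter 1
*/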