/* serial_master()
 * The serial version: for each HMM in the input file, run the work unit
 * (score simulated sequences) and output the results.
 * All errors are handled immediately and fatally with p7_Fail().
 */
static void
serial_master(ESL_GETOPTS *go, struct cfg_s *cfg)
{
  P7_HMM *hmm    = NULL;           /* current query HMM                       */
  double *scores = NULL;           /* results: array of N scores              */
  int    *alilen = NULL;           /* optional results: N alignment lengths   */
  char    errbuf[eslERRBUFSIZE];
  int     status;

  status = init_master_cfg(go, cfg, errbuf);
  if (status != eslOK) p7_Fail(errbuf);

  scores = malloc(sizeof(double) * cfg->N);
  if (scores == NULL) p7_Fail("allocation failed");

  if (esl_opt_GetBoolean(go, "-a"))
    {
      alilen = malloc(sizeof(int) * cfg->N);
      if (alilen == NULL) p7_Fail("allocation failed");
    }

  for (;;)
    {
      status = p7_hmmfile_Read(cfg->hfp, &(cfg->abc), &hmm);
      if (status == eslEOF) break;      /* normal termination: file exhausted */

      switch (status) {
      case eslOK:        break;
      case eslEOD:       p7_Fail("read failed, HMM file %s may be truncated?", cfg->hmmfile); break;
      case eslEFORMAT:   p7_Fail("bad file format in HMM file %s",             cfg->hmmfile); break;
      case eslEINCOMPAT: p7_Fail("HMM file %s contains different alphabets",   cfg->hmmfile); break;
      default:           p7_Fail("Unexpected error in reading HMMs from %s",   cfg->hmmfile); break;
      }

      /* Lazily create the null model the first time through, once the
       * alphabet is known from the first HMM read.
       */
      if (cfg->bg == NULL)
        {
          cfg->bg = esl_opt_GetBoolean(go, "--bgflat") ? p7_bg_CreateUniform(cfg->abc) : p7_bg_Create(cfg->abc);
          p7_bg_SetLength(cfg->bg, esl_opt_GetInteger(go, "-L")); /* set the null model background length in both master and workers. */
        }

      if (esl_opt_GetBoolean(go, "--recal") && recalibrate_model(go, cfg, errbuf, hmm) != eslOK)
        p7_Fail(errbuf);

      if (process_workunit(go, cfg, errbuf, hmm, scores, alilen) != eslOK) p7_Fail(errbuf);
      if (output_result   (go, cfg, errbuf, hmm, scores, alilen) != eslOK) p7_Fail(errbuf);

      p7_hmm_Destroy(hmm);
    }

  free(scores);
  free(alilen);    /* free(NULL) is a harmless no-op */
}
/* mpi_master()
 * The MPI version of hmmbuild.
 * Follows standard pattern for a master/worker load-balanced MPI program (J1/78-79).
 *
 * A master can only return if it's successful.
 * Errors in an MPI master come in two classes: recoverable and nonrecoverable.
 *
 * Recoverable errors include all worker-side errors, and any
 * master-side error that do not affect MPI communication. Error
 * messages from recoverable messages are delayed until we've cleanly
 * shut down the workers.
 *
 * Unrecoverable errors are master-side errors that may affect MPI
 * communication, meaning we cannot count on being able to reach the
 * workers and shut them down. Unrecoverable errors result in immediate
 * p7_Fail()'s, which will cause MPI to shut down the worker processes
 * uncleanly.
 */
static void
mpi_master(const ESL_GETOPTS *go, struct cfg_s *cfg)
{
  int        xstatus       = eslOK; /* changes from OK on recoverable error */
  int        status;
  int        have_work     = TRUE;  /* TRUE while alignments remain */
  int        nproc_working = 0;     /* number of worker processes working, up to nproc-1 */
  int        wi;                    /* rank of next worker to get an alignment to work on */
  char      *buf           = NULL;  /* input/output buffer, for packed MPI messages */
  int        bn            = 0;     /* allocated size of <buf>, in bytes */
  ESL_MSA   *msa           = NULL;  /* next alignment read from the input file */
  P7_HMM    *hmm           = NULL;  /* HMM unpacked from a worker's result */
  P7_BG     *bg            = NULL;  /* null model, for relative entropy calculation */
  ESL_MSA  **msalist       = NULL;  /* msalist[wi]: MSA that worker <wi> is working on */
  ESL_MSA   *postmsa       = NULL;  /* optional posterior-annotated MSA from a worker */
  int       *msaidx        = NULL;  /* msaidx[wi]: index (1..nali) of worker <wi>'s MSA */
  char       errmsg[eslERRBUFSIZE];
  MPI_Status mpistatus;
  int        n;                     /* size of an incoming packed result, in bytes */
  int        pos;                   /* current position in packed buffer, for MPI_Unpack */
  double     entropy;               /* mean match relative entropy of a new HMM */

  /* Master initialization: including, figure out the alphabet type.
   * If any failure occurs, delay printing error message until we've shut down workers.
   */
  if (xstatus == eslOK) { if ((status = init_master_cfg(go, cfg, errmsg)) != eslOK) xstatus = status; }
  if (xstatus == eslOK) { bn = 4096; if ((buf = malloc(sizeof(char) * bn)) == NULL)            { sprintf(errmsg, "allocation failed"); xstatus = eslEMEM; } }
  if (xstatus == eslOK) { if ((msalist = malloc(sizeof(ESL_MSA *) * cfg->nproc)) == NULL)      { sprintf(errmsg, "allocation failed"); xstatus = eslEMEM; } }
  if (xstatus == eslOK) { if ((msaidx  = malloc(sizeof(int)       * cfg->nproc)) == NULL)      { sprintf(errmsg, "allocation failed"); xstatus = eslEMEM; } }

  /* Broadcast master's init status; on failure, workers see it and exit cleanly too. */
  MPI_Bcast(&xstatus, 1, MPI_INT, 0, MPI_COMM_WORLD);
  if (xstatus != eslOK) { MPI_Finalize(); p7_Fail(errmsg); }
  ESL_DPRINTF1(("MPI master is initialized\n"));

  bg = p7_bg_Create(cfg->abc);

  for (wi = 0; wi < cfg->nproc; wi++) { msalist[wi] = NULL; msaidx[wi] = 0; }

  /* Worker initialization:
   * Because we've already successfully initialized the master before we start
   * initializing the workers, we don't expect worker initialization to fail;
   * so we just receive a quick OK/error code reply from each worker to be sure,
   * and don't worry about an informative message.
   */
  MPI_Bcast(&(cfg->abc->type), 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Reduce(&xstatus, &status, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);
  if (status != eslOK) { MPI_Finalize(); p7_Fail("One or more MPI worker processes failed to initialize."); }
  ESL_DPRINTF1(("%d workers are initialized\n", cfg->nproc-1));

  /* Main loop: combining load workers, send/receive, clear workers loops;
   * also, catch error states and die later, after clean shutdown of workers.
   *
   * When a recoverable error occurs, have_work = FALSE, xstatus !=
   * eslOK, and errmsg is set to an informative message. No more
   * errmsg's can be received after the first one. We wait for all the
   * workers to clear their work units, then send them shutdown signals,
   * then finally print our errmsg and exit.
   *
   * Unrecoverable errors just crash us out with p7_Fail().
   */
  wi = 1;
  while (have_work || nproc_working)
    {
      /* Load stage: read the next MSA, if any remain. */
      if (have_work)
	{
	  if ((status = esl_msa_Read(cfg->afp, &msa)) == eslOK)
	    {
	      cfg->nali++;
	      ESL_DPRINTF1(("MPI master read MSA %s\n", msa->name == NULL? "" : msa->name));
	    }
	  else
	    {
	      /* Any non-OK read status ends the work supply; only parse/IO errors
	       * (not normal eslEOF) flip xstatus into the recoverable-error state. */
	      have_work = FALSE;
	      if      (status == eslEFORMAT)  { xstatus = eslEFORMAT; snprintf(errmsg, eslERRBUFSIZE, "Alignment file parse error:\n%s\n", cfg->afp->errbuf); }
	      else if (status == eslEINVAL)   { xstatus = eslEFORMAT; snprintf(errmsg, eslERRBUFSIZE, "Alignment file parse error:\n%s\n", cfg->afp->errbuf); }
	      else if (status != eslEOF)      { xstatus = status;     snprintf(errmsg, eslERRBUFSIZE, "Alignment file read unexpectedly failed with code %d\n", status); }
	      ESL_DPRINTF1(("MPI master has run out of MSAs (having read %d)\n", cfg->nali));
	    }
	}

      /* Receive stage: if all workers are busy (or we're draining), collect one result. */
      if ((have_work && nproc_working == cfg->nproc-1) || (!have_work && nproc_working > 0))
	{
	  if (MPI_Probe(MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, &mpistatus) != 0) { MPI_Finalize(); p7_Fail("mpi probe failed"); }
	  if (MPI_Get_count(&mpistatus, MPI_PACKED, &n)                != 0) { MPI_Finalize(); p7_Fail("mpi get count failed"); }
	  wi = mpistatus.MPI_SOURCE;
	  ESL_DPRINTF1(("MPI master sees a result of %d bytes from worker %d\n", n, wi));

	  /* Grow the receive buffer if this result is bigger than anything seen so far. */
	  if (n > bn) { if ((buf = realloc(buf, sizeof(char) * n)) == NULL) p7_Fail("reallocation failed"); bn = n; }
	  if (MPI_Recv(buf, bn, MPI_PACKED, wi, 0, MPI_COMM_WORLD, &mpistatus) != 0) { MPI_Finalize(); p7_Fail("mpi recv failed"); }
	  ESL_DPRINTF1(("MPI master has received the buffer\n"));

	  /* If we're in a recoverable error state, we're only clearing worker results;
	   * just receive them, don't unpack them or print them.
	   * But if our xstatus is OK, go ahead and process the result buffer.
	   */
	  if (xstatus == eslOK)
	    {
	      pos = 0;
	      if (MPI_Unpack(buf, bn, &pos, &xstatus, 1, MPI_INT, MPI_COMM_WORLD) != 0) { MPI_Finalize(); p7_Fail("mpi unpack failed"); }
	      if (xstatus == eslOK)	/* worker reported success. Get the HMM. */
		{
		  ESL_DPRINTF1(("MPI master sees that the result buffer contains an HMM\n"));
		  if (p7_hmm_MPIUnpack(buf, bn, &pos, MPI_COMM_WORLD, &(cfg->abc), &hmm) != eslOK) { MPI_Finalize(); p7_Fail("HMM unpack failed"); }
		  ESL_DPRINTF1(("MPI master has unpacked the HMM\n"));

		  /* A posterior-annotated MSA follows the HMM in the buffer only when requested. */
		  if (cfg->postmsafile != NULL) {
		    if (esl_msa_MPIUnpack(cfg->abc, buf, bn, &pos, MPI_COMM_WORLD, &postmsa) != eslOK) { MPI_Finalize(); p7_Fail("postmsa unpack failed"); }
		  }

		  entropy = p7_MeanMatchRelativeEntropy(hmm, bg);
		  if ((status = output_result(cfg, errmsg, msaidx[wi], msalist[wi], hmm, postmsa, entropy)) != eslOK) xstatus = status;

		  esl_msa_Destroy(postmsa); postmsa = NULL;
		  p7_hmm_Destroy(hmm);      hmm     = NULL;
		}
	      else	/* worker reported an error. Get the errmsg. */
		{
		  if (MPI_Unpack(buf, bn, &pos, errmsg, eslERRBUFSIZE, MPI_CHAR, MPI_COMM_WORLD) != 0) { MPI_Finalize(); p7_Fail("mpi unpack of errmsg failed"); }
		  ESL_DPRINTF1(("MPI master sees that the result buffer contains an error message\n"));
		}
	    }
	  /* Worker <wi> is now free; release the MSA it was working on. */
	  esl_msa_Destroy(msalist[wi]);
	  msalist[wi] = NULL;
	  msaidx[wi]  = 0;
	  nproc_working--;
	}

      /* Send stage: hand the freshly-read MSA to the free worker <wi>. */
      if (have_work)
	{
	  ESL_DPRINTF1(("MPI master is sending MSA %s to worker %d\n", msa->name == NULL ? "":msa->name, wi));
	  if (esl_msa_MPISend(msa, wi, 0, MPI_COMM_WORLD, &buf, &bn) != eslOK) p7_Fail("MPI msa send failed");
	  msalist[wi] = msa;        /* master keeps the MSA until the worker's result returns */
	  msaidx[wi]  = cfg->nali;  /* 1..N for N alignments in the MSA database */
	  msa = NULL;
	  wi++;
	  nproc_working++;
	}
    }

  /* On success or recoverable errors:
   * Shut down workers cleanly.
   */
  ESL_DPRINTF1(("MPI master is done. Shutting down all the workers cleanly\n"));
  for (wi = 1; wi < cfg->nproc; wi++)
    if (esl_msa_MPISend(NULL, wi, 0, MPI_COMM_WORLD, &buf, &bn) != eslOK) p7_Fail("MPI msa send failed");

  free(buf);
  free(msaidx);
  free(msalist);
  p7_bg_Destroy(bg);

  /* Recoverable errors were deferred until now, after the clean shutdown above. */
  if (xstatus != eslOK) { MPI_Finalize(); p7_Fail(errmsg); }
  else                  return;
}
/* serial_master()
 * The serial version of hmmbuild.
 * For each MSA, build an HMM and save it.
 *
 * A master can only return if it's successful. All errors are handled immediately and fatally with p7_Fail().
 */
static int
serial_master(const ESL_GETOPTS *go, struct cfg_s *cfg)
{
  int             status;
  int             i;
  int             ncpus     = 0;     /* number of worker threads (0 = unthreaded path) */
  int             infocnt   = 0;     /* number of per-worker info structures to allocate */
  WORKER_INFO    *info      = NULL;  /* per-worker state: null model, builder, queue handle */
#ifdef HMMER_THREADS
  WORK_ITEM      *item      = NULL;  /* one unit of work circulating through the queue */
  ESL_THREADS    *threadObj = NULL;
  ESL_WORK_QUEUE *queue     = NULL;
#endif
  char            errmsg[eslERRBUFSIZE];

  if ((status = init_master_cfg(go, cfg, errmsg)) != eslOK) p7_Fail(errmsg);

#ifdef HMMER_THREADS
  /* initialize thread data: honor --cpu if given, else autodetect CPU count */
  if (esl_opt_IsOn(go, "--cpu")) ncpus = esl_opt_GetInteger(go, "--cpu");
  else                           esl_threads_CPUCount(&ncpus);

  if (ncpus > 0)
    {
      threadObj = esl_threads_Create(&pipeline_thread);
      queue     = esl_workqueue_Create(ncpus * 2);   /* 2 work items per thread keeps the pipeline fed */
    }
#endif

  infocnt = (ncpus == 0) ? 1 : ncpus;
  ESL_ALLOC(info, sizeof(*info) * infocnt);

  /* Give each worker its own null model and builder. */
  for (i = 0; i < infocnt; ++i)
    {
      info[i].bg  = p7_bg_Create(cfg->abc);
      info[i].bld = p7_builder_Create(go, cfg->abc);
      if (info[i].bld == NULL) p7_Fail("p7_builder_Create failed");
      /* NOTE(review): p7_bg_Create()'s result is not checked here, unlike
       * p7_builder_Create() below it — presumably it only fails on OOM; confirm. */
#ifdef HMMER_THREADS
      info[i].queue = queue;
      if (ncpus > 0) esl_threads_AddThread(threadObj, &info[i]);
#endif
    }

#ifdef HMMER_THREADS
  /* Preload the work queue with empty, unprocessed items. */
  for (i = 0; i < ncpus * 2; ++i)
    {
      ESL_ALLOC(item, sizeof(*item));

      item->nali      = 0;
      item->processed = FALSE;
      item->postmsa   = NULL;
      item->msa       = NULL;
      item->hmm       = NULL;
      item->entropy   = 0.0;

      status = esl_workqueue_Init(queue, item);
      if (status != eslOK) esl_fatal("Failed to add block to work queue");
    }
#endif

  /* Run the build: threaded if we have worker threads, serial otherwise. */
#ifdef HMMER_THREADS
  if (ncpus > 0) status = thread_loop(threadObj, queue, cfg);
  else           status = serial_loop(info, cfg);
#else
  status = serial_loop(info, cfg);
#endif

  /* The loops return the final esl_msa_Read() status; eslEOF is the normal outcome. */
  if      (status == eslEFORMAT) esl_fatal("Alignment file parse error:\n%s\n", cfg->afp->errbuf);
  else if (status == eslEINVAL)  esl_fatal("Alignment file parse error:\n%s\n", cfg->afp->errbuf);
  else if (status != eslEOF)     esl_fatal("Alignment file read failed with error code %d\n", status);

  for (i = 0; i < infocnt; ++i)
    {
      p7_bg_Destroy(info[i].bg);
      p7_builder_Destroy(info[i].bld);
    }

#ifdef HMMER_THREADS
  if (ncpus > 0)
    {
      /* Drain and free all work items before tearing down the queue and threads. */
      esl_workqueue_Reset(queue);
      while (esl_workqueue_Remove(queue, (void **) &item) == eslOK)
	{
	  free(item);
	}
      esl_workqueue_Destroy(queue);
      esl_threads_Destroy(threadObj);
    }
#endif

  free(info);
  return eslOK;

 ERROR:
  /* Reached only via ESL_ALLOC failure; earlier allocations (info, queue items)
   * are not freed here — NOTE(review): acceptable since callers treat eslFAIL
   * as fatal, but a leak if this return were ever handled and recovered from. */
  return eslFAIL;
}
/* mpi_master()
 * The MPI version of hmmsim.
 * Follows standard pattern for a master/worker load-balanced MPI program (J1/78-79).
 *
 * A master can only return if it's successful.
 * Errors in an MPI master come in two classes: recoverable and nonrecoverable.
 *
 * Recoverable errors include all worker-side errors, and any
 * master-side error that do not affect MPI communication. Error
 * messages from recoverable messages are delayed until we've cleanly
 * shut down the workers.
 *
 * Unrecoverable errors are master-side errors that may affect MPI
 * communication, meaning we cannot count on being able to reach the
 * workers and shut them down. Unrecoverable errors result in immediate
 * p7_Fail()'s, which will cause MPI to shut down the worker processes
 * uncleanly.
 */
static void
mpi_master(ESL_GETOPTS *go, struct cfg_s *cfg)
{
  int         xstatus       = eslOK; /* changes in the event of a recoverable error */
  P7_HMM     *hmm           = NULL;  /* query HMM */
  P7_HMM    **hmmlist       = NULL;  /* queue of HMMs being worked on, 1..nproc-1 */
  char       *wbuf          = NULL;  /* working buffer for sending packed profiles and receiving packed results. */
  int         wn            = 0;     /* allocated size of <wbuf> */
  double     *xv            = NULL;  /* results: array of N scores */
  int        *av            = NULL;  /* optional results: array of N alignment lengths */
  int         have_work     = TRUE;  /* TRUE while HMMs remain in the input file */
  int         nproc_working = 0;     /* number of workers currently busy */
  int         wi;                    /* rank of next worker to receive work */
  int         pos;                   /* position in packed buffer, for MPI_Unpack */
  char        errbuf[eslERRBUFSIZE];
  int         status;
  MPI_Status  mpistatus;

  /* Master initialization. */
  if (init_master_cfg(go, cfg, errbuf)            != eslOK) p7_Fail(errbuf);
  if (minimum_mpi_working_buffer(go, cfg->N, &wn) != eslOK) p7_Fail("mpi pack sizes must have failed");
  ESL_ALLOC(wbuf, sizeof(char)   * wn);
  ESL_ALLOC(xv,   sizeof(double) * cfg->N);
  if (esl_opt_GetBoolean(go, "-a"))
    ESL_ALLOC(av, sizeof(int) * cfg->N);
  ESL_ALLOC(hmmlist, sizeof(P7_HMM *) * cfg->nproc);
  for (wi = 0; wi < cfg->nproc; wi++) hmmlist[wi] = NULL;

  /* Standard design pattern for data parallelization in a master/worker model. (J1/78-79).
   */
  wi = 1;
  while (have_work || nproc_working)
    {
      /* Get next work unit: one HMM, <hmm> */
      if (have_work)
	{
	  if ((status = p7_hmmfile_Read(cfg->hfp, &(cfg->abc), &hmm)) != eslOK)
	    {
	      have_work = FALSE;    /* normal eslEOF, or a recoverable read error */
	      if      (status == eslEOD)       { xstatus = status; sprintf(errbuf, "read failed, HMM file %s may be truncated?", cfg->hmmfile); }
	      else if (status == eslEFORMAT)   { xstatus = status; sprintf(errbuf, "bad file format in HMM file %s",             cfg->hmmfile); }
	      else if (status == eslEINCOMPAT) { xstatus = status; sprintf(errbuf, "HMM file %s contains different alphabets",   cfg->hmmfile); }
	      else if (status != eslEOF)       { xstatus = status; sprintf(errbuf, "Unexpected error in reading HMMs from %s",   cfg->hmmfile); }
	    }

	  /* BUG FIX: this null-model setup was nested inside the read-failure
	   * branch above, so cfg->bg was never created while HMMs were actually
	   * being read and distributed. It must run on every pass through the
	   * work-loading stage, as the comments below intend. */
	  if (cfg->bg == NULL) { // first time only
	    if (esl_opt_GetBoolean(go, "--bgflat")) cfg->bg = p7_bg_CreateUniform(cfg->abc);
	    else                                    cfg->bg = p7_bg_Create(cfg->abc);
	  }
	  //this next step is redundant, but it avoids a race condition above.
	  p7_bg_SetLength(cfg->bg, esl_opt_GetInteger(go, "-L")); /* set the null model background length in both master and workers. */
	}

      /* If we have work but no free workers, or we have no work but workers
       * are still working, then wait for a result to return from any worker.
       */
      if ((have_work && nproc_working == cfg->nproc-1) || (! have_work && nproc_working > 0))
	{
	  if (MPI_Recv(wbuf, wn, MPI_PACKED, MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, &mpistatus) != 0) p7_Fail("mpi recv failed");
	  wi = mpistatus.MPI_SOURCE;

	  /* Check the xstatus before printing results.
	   * If we're in a recoverable error state, we're only clearing worker results, prior to a clean failure
	   */
	  if (xstatus == eslOK)
	    {
	      pos = 0;
	      if (MPI_Unpack(wbuf, wn, &pos, &xstatus, 1, MPI_INT, MPI_COMM_WORLD) != 0) p7_Fail("mpi unpack failed");
	      if (xstatus == eslOK)	/* worker reported success. Get the results. */
		{
		  if (MPI_Unpack(wbuf, wn, &pos, xv, cfg->N, MPI_DOUBLE, MPI_COMM_WORLD) != 0) p7_Fail("score vector unpack failed");
		  if (esl_opt_GetBoolean(go, "-a") &&
		      MPI_Unpack(wbuf, wn, &pos, av, cfg->N, MPI_INT, MPI_COMM_WORLD) != 0) p7_Fail("alilen vector unpack failed");
		  if ((status = output_result(go, cfg, errbuf, hmmlist[wi], xv, av)) != eslOK) xstatus = status;
		}
	      else	/* worker reported a user error. Get the errbuf. */
		{
		  if (MPI_Unpack(wbuf, wn, &pos, errbuf, eslERRBUFSIZE, MPI_CHAR, MPI_COMM_WORLD) != 0) p7_Fail("mpi unpack of errbuf failed");
		  have_work = FALSE;
		  p7_hmm_Destroy(hmm);  /* discard the pending, not-yet-sent work unit, if any */
		  hmm = NULL;           /* FIX: prevent a stale pointer from being destroyed twice */
		}
	    }
	  /* Worker <wi> is free again; release the HMM it was working on. */
	  p7_hmm_Destroy(hmmlist[wi]);
	  hmmlist[wi] = NULL;
	  nproc_working--;
	}

      /* If we have work, assign it to a free worker; else, terminate the free worker. */
      if (have_work)
	{
	  p7_hmm_mpi_Send(hmm, wi, 0, MPI_COMM_WORLD, &wbuf, &wn);
	  hmmlist[wi] = hmm;    /* ownership of <hmm> passes to hmmlist[wi] ... */
	  hmm = NULL;           /* ... so clear the local handle (also guards the Destroy above). */
	  wi++;
	  nproc_working++;
	}
    }

  /* Tell all the workers (1..nproc-1) to shut down by sending them a NULL workunit. */
  for (wi = 1; wi < cfg->nproc; wi++)
    if (p7_hmm_mpi_Send(NULL, wi, 0, MPI_COMM_WORLD, &wbuf, &wn) != eslOK) p7_Fail("MPI HMM send failed");

  free(hmmlist);
  free(wbuf);
  free(xv);
  free(av);   /* free(NULL) is a no-op, so no guard needed */

  /* Recoverable errors were deferred until after the clean shutdown above. */
  if (xstatus != eslOK) p7_Fail(errbuf);
  else                  return;

 ERROR:
  /* ESL_ALLOC failure path: free whatever was allocated, then die. */
  free(hmmlist);
  free(wbuf);
  free(xv);
  free(av);
  p7_Fail("Fatal error in mpi_master");
}