void P7ReadNullModel(char *rndfile, float *null, float *ret_p1) { FILE *fp; char *s; int x; int type = 0; if ((fp = fopen(rndfile, "r")) == NULL) Die("Failed to open null model file %s\n", rndfile); if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE; s2upper(s); if (strcmp(s, "NUCLEIC") == 0) type = hmmNUCLEIC; else if (strcmp(s, "AMINO") == 0) type = hmmAMINO; else goto FAILURE; /* check/set alphabet type */ if (Alphabet_type == 0) SetAlphabet(type); else if (Alphabet_type != type) Die("Alphabet type conflict; null model in %s is inappropriate\n", rndfile); /* parse the file */ for (x = 0; x < Alphabet_size; x++) { if ((s = Getword(fp, sqdARG_FLOAT)) == NULL) goto FAILURE; null[x] = atof(s); } if ((s = Getword(fp, sqdARG_FLOAT)) == NULL) goto FAILURE; *ret_p1 = atof(s); fclose(fp); return; FAILURE: fclose(fp); Die("%s is not in HMMER null model file format", rndfile); }
void HMMCreateWPoolTask::runUnsafe() { const UHMMCalibrateSettings& settings = pt->getSettings(); WorkPool_s* wpool = pt->getWorkPool(); SetAlphabet(wpool->hmm->atype); sre_srandom(settings.seed); wpool->fixedlen = settings.fixedlen; wpool->hist = AllocHistogram(-200, 200, 100); wpool->lenmean = settings.lenmean; wpool->lensd = settings.lensd; wpool->nsample = settings.nsample; wpool->nseq = 0; wpool->randomseq.resize(MAXABET); wpool->max_score = -FLT_MAX; float p1; P7Logoddsify(wpool->hmm, TRUE); P7DefaultNullModel(wpool->randomseq.data(), &p1); }
/* Function: DetermineAlphabet() * * Purpose: From a set of sequences (raw or aligned), make a good * guess whether they're Nucleic, Amino, or something * else, and set alphabet accordingly. * * If Alphabet_type is already set, that means our * autodetection was overridden from the command line, * and we just set the other globals accordingly. */ void DetermineAlphabet(char **rseqs, int nseq) { int idx; int other, nucleic, amino; int type; /* Autodetection of alphabet type. */ type = hmmNOTSETYET; other = nucleic = amino = 0; for (idx = 0; idx < nseq; idx++) { switch (Seqtype(rseqs[idx])) { case kRNA: nucleic++; break; case kDNA: nucleic++; break; case kAmino: amino++; break; case kOtherSeq: other++; break; default: Die("No such alphabet type"); } } if (nucleic == nseq) type = hmmNUCLEIC; else if (amino == nseq) type = hmmAMINO; else if (nucleic > amino && nucleic > other) { Warn("Looks like nucleic acid sequence, hope that's right"); type = hmmNUCLEIC; } else if (amino > nucleic && amino > other) { Warn("Looks like amino acid sequence, hope that's right"); type = hmmAMINO; } else Die("Sorry, I can't tell if that's protein or DNA"); /* Now set up the alphabet. */ SetAlphabet(type); }
/// Selects the alphabet `alpha` and then passes "-" to SetAllGap().
void CAlnReader::SetPhylip(EAlphabet alpha)
{
    this->SetAlphabet(alpha);
    this->SetAllGap("-");
}
/// Selects the alphabet `alpha` and then passes "-" to SetAllGap().
void CAlnReader::SetClustal(EAlphabet alpha)
{
    this->SetAlphabet(alpha);
    this->SetAllGap("-");
}
/// Selects the alphabet `alpha` and then passes "-" to SetAllGap().
void CAlnReader::SetFastaGap(EAlphabet alpha)
{
    this->SetAlphabet(alpha);
    this->SetAllGap("-");
}
/**
 * Searches the sequence `seq` of length `seqLen` with a clone of the profile HMM `_hmm`
 * and returns the domain hits that pass the thresholds taken from `s`.
 *
 * Only the clone is passed to the calls below; it is released with FreePlan7() on every
 * exit path, including the early error returns.
 *
 * @param _hmm    model to search with (cloned internally)
 * @param seq     sequence to search
 * @param seqLen  length of `seq`
 * @param s       search settings: E-value / score cutoffs, Z override, algorithm choice
 * @param si      task state; receives an error message on failure and is polled for
 *                cancellation while the hits are collected
 * @return        list of domain hits with zero-based regions; empty if an error was set
 */
QList<UHMMSearchResult> UHMMSearch::search(plan7_s* _hmm, const char* seq, int seqLen, const UHMMSearchSettings& s, TaskStateInfo& si)
{
    plan7_s * hmm = HMMIO::cloneHMM( _hmm );

    // Set up score thresholds.
    threshold_s thresh;             // contains all threshold (cutoff) info
    thresh.globE   = s.globE;       // per-sequence E-value cutoff from the settings
    thresh.globT   = -FLT_MAX;      // no per-sequence bit score cutoff
    thresh.domT    = s.domT;        // per-domain bit score cutoff from the settings
    thresh.domE    = s.domE;        // per-domain E-value cutoff from the settings
    thresh.autocut = CUT_NONE;      // no Pfam cutoffs used
    thresh.Z       = s.eValueNSeqs; // if zero, replaced by the number of searched seqs below

    int do_null2   = TRUE;          // TRUE to adjust scores with null model #2
    int do_forward = FALSE;         // TRUE to use Forward() not Viterbi()
    int do_xnu     = FALSE;         // TRUE to filter sequences thru XNU
    QList<UHMMSearchResult> res;    // the results of the method

    // get HMMERTaskLocalData
    HMMERTaskLocalData *tld = getHMMERTaskLocalData();
    alphabet_s *al = &tld->al;

    SetAlphabet(hmm->atype);

    P7Logoddsify(hmm, !do_forward);

    if (do_xnu && al->Alphabet_type == hmmNUCLEIC) {
        si.setError( "The HMM is a DNA model, and you can't use the --xnu filter on DNA data" );
        FreePlan7( hmm ); // the clone is ours; do not leak it on the error path
        return res;
    }

    /*****************************************************************
    * Set up optional Pfam score thresholds.
    * Can do this before starting any searches, since we'll only use 1 HMM.
    *****************************************************************/
    if (!SetAutocuts(&thresh, hmm)) {
        si.setError( "HMM did not contain the GA, TC, or NC cutoffs you needed" );
        FreePlan7( hmm ); // the clone is ours; do not leak it on the error path
        return res;
    }

    // set up structures for storing output
    histogram_s *histogram = AllocHistogram(-200, 200, 100); // keeps full histogram of all scores
    tophit_s *ghit = AllocTophits(200);                      // per-seq hits: 200=lumpsize
    tophit_s *dhit = AllocTophits(200);                      // domain hits:  200=lumpsize
    int nseq = 0;                                            // number of sequences searched

    // Dispatch to the search loop selected in the settings. The preprocessor
    // branches only prepend an optional optimized variant to the if/else chain.
#ifdef UGENE_CELL
    if( HMMSearchAlgo_CellOptimized == s.alg ) {
        if( hmm->M < MAX_HMM_LENGTH ) {
            main_loop_spe(hmm, seq, seqLen, &thresh, do_forward, do_null2, do_xnu, histogram, ghit, dhit, &nseq, si);
        } else {
            main_loop_serial(hmm, seq, seqLen, &thresh, do_forward, do_null2, do_xnu, histogram, ghit, dhit, &nseq, si);
        }
    } else
#elif defined(HMMER_BUILD_WITH_SSE2)
    if( HMMSearchAlgo_SSEOptimized == s.alg ) {
        main_loop_opt(hmm, seq, seqLen, &thresh, do_forward, do_null2, do_xnu, histogram, ghit, dhit, &nseq, si, sseScoring);
    } else
#endif
    if( HMMSearchAlgo_Conservative == s.alg ) {
        main_loop_serial(hmm, seq, seqLen, &thresh, do_forward, do_null2, do_xnu, histogram, ghit, dhit, &nseq, si);
    }
    else {
        assert( false && "bad hmmsearch algorithm selected" );
    }

    // Process hit lists, produce text output

    // Set the theoretical EVD curve in our histogram using calibration in the HMM, if available.
    if (hmm->flags & PLAN7_STATS) {
        ExtremeValueSetHistogram(histogram, hmm->mu, hmm->lambda, histogram->lowscore, histogram->highscore, 0);
    }
    if (!thresh.Z) {
        thresh.Z = nseq; // set Z for good now that we're done
    }

    // report our output
    FullSortTophits(dhit);

    //int namewidth = MAX(8, TophitsMaxName(ghit)); // max width of sequence name

    // Report domain hits (sorted on E-value)
    for (int i = 0; i < dhit->num && !si.cancelFlag; i++) {
        float   sc;               // score of an HMM search
        double  pvalue;           // pvalue of an HMM score
        double  evalue;           // evalue of an HMM score
        char    *name, *desc;     // hit sequence name and description
        double  motherp;          // pvalue of a whole seq HMM score
        float   mothersc;         // score of a whole seq parent of domain
        int     sqfrom, sqto;     // coordinates in sequence
        int     sqlen;            // length of seq that was hit
        int     hmmfrom, hmmto;   // coordinate in HMM
        int     ndom;             // total # of domains in this seq
        int     domidx;           // number of this domain

        GetRankedHit(dhit, i, &pvalue, &sc, &motherp, &mothersc,
                     &name, NULL, &desc,
                     &sqfrom, &sqto, &sqlen,   // seq position info
                     &hmmfrom, &hmmto, NULL,   // HMM position info
                     &domidx, &ndom,           // domain info
                     NULL);                    // alignment info

        evalue = pvalue * (double) thresh.Z;

        // A domain is dropped when its parent sequence fails the per-sequence
        // cutoffs; otherwise it is reported if it passes the per-domain cutoffs.
        if (motherp * (double) thresh.Z > thresh.globE || mothersc < thresh.globT) {
            continue;
        } else if (evalue <= thresh.domE && sc >= thresh.domT) {
            // hmm reports results in range [1...N] -> translate it to [0..N)
            res.append(UHMMSearchResult(U2Region(sqfrom-1, sqto-sqfrom+1), sc, evalue));
        }
    }

    // Clean-up and exit.
    FreeHistogram(histogram);
    FreeTophits(ghit);
    FreeTophits(dhit);
    FreePlan7( hmm );

    return res;
}
int main(void) { struct p7trace_s *tr; /* traceback of an alignment */ int master_tid; /* PVM TID of our master */ char *hmmfile; /* file to read HMM(s) from */ HMMFILE *hmmfp; /* opened hmmfile for reading */ struct plan7_s *hmm; char *seq; char *dsq; int len; int nhmm; /* number of HMM to work on */ float sc; int my_idx = -1; /* my index, 0..nslaves-1 */ float globT; /* T parameter: keep only hits > globT bits */ double globE; /* E parameter: keep hits < globE E-value */ double pvalue; /* Z*pvalue = Evalue */ int Z; /* nseq to base E value calculation on */ int send_trace; /* TRUE if score is significant */ int do_xnu; /* TRUE to do XNU filter on seq */ int do_forward; /* TRUE to use Forward() scores not Viterbi */ int do_null2; /* TRUE to correct scores w/ ad hoc null2 */ int alphatype; /* alphabet type, hmmAMINO or hmmNUCLEIC */ int code; /* return code after initialization */ /* Register leave_pvm() cleanup function so any exit() call * first calls pvm_exit(). */ if (atexit(leave_pvm) != 0) { pvm_exit(); Die("slave couldn't register leave_pvm()"); } /***************************************************************** * initialization. * Master broadcasts to us: * 1) len of HMM file name (int) * 2) name of HMM file (string) * 3) length of sequence string (int) * 4) sequence (string) * 5) globT threshold * 6) globE threshold * 7) Z * 8) do_xnu flag * 9) do_forward flag * 10) do_null2 flag * 11) alphabet type * We receive the broadcast and open the files. ******************************************************************/ master_tid = pvm_parent(); /* who's our master? 
*/ pvm_recv(master_tid, HMMPVM_INIT); pvm_upkint(&len, 1, 1); hmmfile = MallocOrDie(sizeof(char *) * (len+1)); pvm_upkstr(hmmfile); pvm_upkint(&len, 1, 1); seq = MallocOrDie(sizeof(char *) * (len+1)); pvm_upkstr(seq); pvm_upkfloat(&globT, 1, 1); pvm_upkdouble(&globE, 1, 1); pvm_upkint(&Z, 1, 1); pvm_upkint(&do_xnu, 1, 1); pvm_upkint(&do_forward, 1, 1); pvm_upkint(&do_null2, 1, 1); pvm_upkint(&alphatype, 1, 1); SetAlphabet(alphatype); /* Open HMM file (maybe in HMMERDB) */ code = HMMPVM_OK; if ((hmmfp = HMMFileOpen(hmmfile, "HMMERDB")) == NULL) code = HMMPVM_NO_HMMFILE; else if (hmmfp->gsi == NULL) code = HMMPVM_NO_INDEX; /* report our status. */ pvm_initsend(PvmDataDefault); pvm_pkint(&code, 1, 1); pvm_send(master_tid, HMMPVM_RESULTS); dsq = DigitizeSequence(seq, len); if (do_xnu) XNU(dsq, len); /***************************************************************** * Main loop. * Receive an integer 0..nhmm-1 for which HMM to search against. * If we receive a -1, we shut down. *****************************************************************/ for (;;) { pvm_recv(master_tid, HMMPVM_WORK); pvm_upkint(&nhmm, 1, 1); if (my_idx < 0) my_idx = nhmm; /* first time thru, remember what index we are. */ if (nhmm == -1) break; /* shutdown signal */ /* move to our assigned HMM in the HMM file, and read it */ HMMFilePositionByIndex(hmmfp, nhmm); if (! 
HMMFileRead(hmmfp, &hmm)) Die("unexpected end of HMM file"); if (hmm == NULL) Die("unexpected failure to parse HMM file"); P7Logoddsify(hmm, TRUE); /* Score sequence, do alignment (Viterbi), recover trace */ if (P7ViterbiSize(len, hmm->M) <= RAMLIMIT) { SQD_DPRINTF1(("P7Viterbi(): Estimated size %d Mb\n", P7ViterbiSize(len, hmm->M))); sc = P7Viterbi(dsq, len, hmm, &tr); } else { SQD_DPRINTF1(("P7SmallViterbi() called; %d Mb > %d\n", P7ViterbiSize(len, hmm->M), RAMLIMIT)); sc = P7SmallViterbi(dsq, len, hmm, &tr); } if (do_forward) sc = P7Forward(dsq, len, hmm, NULL); if (do_null2) sc -= TraceScoreCorrection(hmm, tr, dsq); pvalue = PValue(hmm, sc); send_trace = (sc > globT && pvalue * (float) Z < globE) ? 1 : 0; /* return output */ pvm_initsend(PvmDataDefault); pvm_pkint(&my_idx, 1, 1); /* tell master who we are */ pvm_pkstr(hmm->name); /* double check that we did the right thing */ pvm_pkfloat(&sc, 1, 1); pvm_pkdouble(&pvalue, 1, 1); pvm_pkint(&send_trace, 1, 1); /* flag for whether a trace structure is coming */ if (send_trace) PVMPackTrace(tr); pvm_send(master_tid, HMMPVM_RESULTS); /* cleanup */ FreePlan7(hmm); P7FreeTrace(tr); } /*********************************************** * Cleanup, return. ***********************************************/ HMMFileClose(hmmfp); free(seq); free(dsq); free(hmmfile); return 0; }
// Builds a trie over `alphabet`: installs the alphabet through
// SetAlphabet(alphabet, false), appends one freshly allocated _SimpleList
// via AppendNewInstance(), and streams the initial entries 0 into `payload`
// and -1 into `parents`.
_Trie::_Trie (const _String* alphabet)
{
    SetAlphabet (alphabet, false);

    AppendNewInstance (new _SimpleList);

    payload << 0L;
    parents << -1L;
}
static void main_loop_serial(struct plan7_s *hmm, int seed, int nsample, float lenmean, float lensd, int fixedlen, struct histogram_s **ret_hist, float *ret_max, int& cancelFlag, int& progress) { struct histogram_s *hist; struct dpmatrix_s *mx; float randomseq[MAXABET]; float p1; float max; char *seq; unsigned char *dsq; float score; int sqlen; int idx; // Initialize. // We assume we've already set the alphabet (safe, because // HMM input sets the alphabet). sre_srandom(seed); //get HMMERTaskLocalData HMMERTaskLocalData *tls = getHMMERTaskLocalData(); alphabet_s &al = tls->al; SetAlphabet(hmm->atype); P7Logoddsify(hmm, TRUE); P7DefaultNullModel(randomseq, &p1); hist = AllocHistogram(-200, 200, 100); mx = CreatePlan7Matrix(1, hmm->M, 25, 0); max = -FLT_MAX; progress = 0; int pStub; for (idx = 0; idx < nsample && !cancelFlag; idx++) { // choose length of random sequence if (fixedlen) { sqlen = fixedlen; } else { do sqlen = (int) Gaussrandom(lenmean, lensd); while (sqlen < 1); } // generate it seq = RandomSequence(al.Alphabet, randomseq, al.Alphabet_size, sqlen); dsq = DigitizeSequence(seq, sqlen); if (P7ViterbiSpaceOK(sqlen, hmm->M, mx)) { score = P7Viterbi(dsq, sqlen, hmm, mx, NULL); } else { score = P7SmallViterbi(dsq, sqlen, hmm, mx, NULL, pStub); } AddToHistogram(hist, score); max = qMax(score, max); progress = int(100*idx/float(nsample)); free(dsq); free(seq); } FreePlan7Matrix(mx); *ret_hist = hist; *ret_max = max; }
int main(void) { int master_tid; /* PVM TID of our master */ int slaveidx; /* my slave index (0..nslaves-1) */ struct plan7_s *hmm; /* HMM to calibrate, sent from master */ struct histogram_s *hist; /* score histogram */ int hmmidx; /* index of this HMM */ char *seq; /* synthetic random sequence */ char *dsq; /* digitized seq */ int len; /* length of seq */ float sc; /* score of seq aligned to HMM */ float max; /* maximum score seen in sample */ int seed; /* random number seed */ int nsample; /* number of seqs to sample */ int fixedlen; /* if nonzero, fixed length of seq */ float lenmean; /* Gaussian mean length of seq */ float lensd; /* Gaussian length std. dev. for seq */ int fitok; /* TRUE if EVD fit was OK */ float randomseq[MAXABET]; /* iid frequencies of residues */ float p1; int alphatype; /* alphabet type, hmmAMINO or hmmNUCLEIC */ int idx; int code; /* Register leave_pvm() cleanup function so any exit() call * first calls pvm_exit(). */ if (atexit(leave_pvm) != 0) { pvm_exit(); Die("slave couldn't register leave_pvm()"); } /***************************************************************** * initialization. * Master broadcasts the problem to us: parameters of the * HMM calibration. ******************************************************************/ master_tid = pvm_parent(); /* who's our master? */ pvm_recv(master_tid, HMMPVM_INIT); pvm_upkint(&nsample, 1, 1); pvm_upkint(&fixedlen, 1, 1); pvm_upkfloat(&lenmean, 1, 1); pvm_upkfloat(&lensd, 1, 1); /* tell the master we're OK and ready to go (or not) */ code = HMMPVM_OK; pvm_initsend(PvmDataDefault); pvm_pkint(&code, 1, 1); pvm_send(master_tid, HMMPVM_RESULTS); /***************************************************************** * Main loop. * Receive a random number seed, then an HMM to search against. * If we receive a -1 seed, we shut down. 
*****************************************************************/ slaveidx = -1; for (;;) { pvm_recv(master_tid, HMMPVM_WORK); pvm_upkint(&seed, 1, 1); if (seed == -1) break; /* shutdown signal */ pvm_upkint(&hmmidx, 1, 1); pvm_upkint(&alphatype,1, 1); SetAlphabet(alphatype); hmm = PVMUnpackHMM(); if (hmm == NULL) Die("oh no, the HMM never arrived"); if (slaveidx == -1) slaveidx = hmmidx; P7DefaultNullModel(randomseq, &p1); sre_srandom(seed); P7Logoddsify(hmm, TRUE); hist = AllocHistogram(-200, 200, 100); max = -FLT_MAX; for (idx = 0; idx < nsample; idx++) { /* choose length of random sequence */ if (fixedlen) len = fixedlen; else do len = (int) Gaussrandom(lenmean, lensd); while (len < 1); /* generate it */ seq = RandomSequence(Alphabet, randomseq, Alphabet_size, len); dsq = DigitizeSequence(seq, len); if (P7ViterbiSize(len, hmm->M) <= RAMLIMIT) sc = P7Viterbi(dsq, len, hmm, NULL); else sc = P7SmallViterbi(dsq, len, hmm, NULL); AddToHistogram(hist, sc); if (sc > max) max = sc; free(seq); free(dsq); } /* Fit an EVD to the observed histogram. * The TRUE left-censors and fits only the right slope of the histogram. * The 9999. is an arbitrary high number that means we won't trim outliers * on the right. */ fitok = ExtremeValueFitHistogram(hist, TRUE, 9999.); /* Return output to master. * Currently we don't send the histogram back, but we could. */ pvm_initsend(PvmDataDefault); pvm_pkint(&slaveidx, 1, 1); pvm_pkint(&hmmidx, 1, 1); PVMPackString(hmm->name); pvm_pkint(&fitok, 1, 1); pvm_pkfloat(&(hist->param[EVD_MU]), 1, 1); pvm_pkfloat(&(hist->param[EVD_LAMBDA]), 1, 1); pvm_pkfloat(&max, 1, 1); pvm_send(master_tid, HMMPVM_RESULTS); /* cleanup */ FreeHistogram(hist); FreePlan7(hmm); } /*********************************************** * Cleanup, return. ***********************************************/ return 0; /* pvm_exit() is called by atexit() registration. */ }