void
P7ReadNullModel(char *rndfile, float *null, float *ret_p1)
{
  FILE *fp;
  char *s;
  int   x;
  int   type = 0; 

  if ((fp = fopen(rndfile, "r")) == NULL)
    Die("Failed to open null model file %s\n", rndfile);
  if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE;
  s2upper(s);
  if      (strcmp(s, "NUCLEIC") == 0) type = hmmNUCLEIC;
  else if (strcmp(s, "AMINO")   == 0) type = hmmAMINO;
  else    goto FAILURE;
				/* check/set alphabet type */
  if (Alphabet_type == 0) 
    SetAlphabet(type);
  else if (Alphabet_type != type)
    Die("Alphabet type conflict; null model in %s is inappropriate\n", rndfile);
				/* parse the file */
  for (x = 0; x < Alphabet_size; x++) {
    if ((s = Getword(fp, sqdARG_FLOAT)) == NULL) goto FAILURE;
    null[x] = atof(s);
  }
  if ((s = Getword(fp, sqdARG_FLOAT)) == NULL) goto FAILURE;
  *ret_p1 = atof(s);

  fclose(fp);
  return;

FAILURE:
  fclose(fp);
  Die("%s is not in HMMER null model file format", rndfile);
}
void HMMCreateWPoolTask::runUnsafe() {
    const UHMMCalibrateSettings& settings = pt->getSettings();
    WorkPool_s* wpool = pt->getWorkPool();

    SetAlphabet(wpool->hmm->atype);
    sre_srandom(settings.seed);

    wpool->fixedlen = settings.fixedlen;
    wpool->hist = AllocHistogram(-200, 200, 100);
    wpool->lenmean = settings.lenmean;
    wpool->lensd = settings.lensd;
    wpool->nsample = settings.nsample;
    wpool->nseq = 0;
    wpool->randomseq.resize(MAXABET);
    wpool->max_score = -FLT_MAX;

        
    float  p1;
    P7Logoddsify(wpool->hmm, TRUE);
    P7DefaultNullModel(wpool->randomseq.data(), &p1);
}
Beispiel #3
0
/* Function: DetermineAlphabet()
 * 
 * Purpose:  From a set of sequences (raw or aligned), make a good
 *           guess whether they're Nucleic, Amino, or something
 *           else, and set alphabet accordingly. 
 *           
 *           If Alphabet_type is already set, that means our
 *           autodetection was overridden from the command line, 
 *           and we just set the other globals accordingly.  
 */
void
DetermineAlphabet(char **rseqs, int  nseq)
{
  int idx;
  int other, nucleic, amino;
  int type;
  
  /* Autodetection of alphabet type.
   */
  type = hmmNOTSETYET;
  other = nucleic = amino = 0;
  for (idx = 0; idx < nseq; idx++) {
    switch (Seqtype(rseqs[idx])) {
    case kRNA:      nucleic++;   break;
    case kDNA:      nucleic++;   break;
    case kAmino:    amino++;     break;
    case kOtherSeq: other++;     break;
    default: Die("No such alphabet type");
    }
  }

  if      (nucleic == nseq) type = hmmNUCLEIC;
  else if (amino   == nseq) type = hmmAMINO;
  else if (nucleic > amino && nucleic > other) {
    Warn("Looks like nucleic acid sequence, hope that's right");
    type = hmmNUCLEIC;
  }
  else if (amino > nucleic && amino > other) {
    Warn("Looks like amino acid sequence, hope that's right");
    type = hmmAMINO;
  }
  else Die("Sorry, I can't tell if that's protein or DNA"); 

  /* Now set up the alphabet.
   */
  SetAlphabet(type);
}
Beispiel #4
0
void CAlnReader::SetPhylip(EAlphabet alpha)
{
    SetAlphabet(alpha);
    SetAllGap("-");
}
Beispiel #5
0
void CAlnReader::SetClustal(EAlphabet alpha)
{
    SetAlphabet(alpha);
    SetAllGap("-");
}
Beispiel #6
0
void CAlnReader::SetFastaGap(EAlphabet alpha)
{
    SetAlphabet(alpha);
    SetAllGap("-");
}
Beispiel #7
0
QList<UHMMSearchResult> UHMMSearch::search(plan7_s* _hmm, const char* seq, int seqLen, const UHMMSearchSettings& s, TaskStateInfo& si) 
{
    plan7_s * hmm = HMMIO::cloneHMM( _hmm );
    //Set up optional Pfam score thresholds. 
    threshold_s thresh;         // contains all threshold (cutoff) info
    thresh.globE   = s.globE; // use a reasonable Eval threshold
    thresh.globT   = -FLT_MAX;  // but no bit threshold
	thresh.domT    = s.domT;  // no domain bit threshold 
	thresh.domE    = s.domE;   // and no domain Eval threshold
    thresh.autocut = CUT_NONE;  // and no Pfam cutoffs used        
    thresh.Z       = s.eValueNSeqs; // Z not preset; use actual # of seqs 

    int   do_null2      = TRUE;    // TRUE to adjust scores with null model #2 
    int   do_forward    = FALSE;   // TRUE to use Forward() not Viterbi()      
    int   do_xnu        = FALSE;   // TRUE to filter sequences thru XNU        
    QList<UHMMSearchResult> res;   // the results of the method

    //get HMMERTaskLocalData
	HMMERTaskLocalData *tld = getHMMERTaskLocalData();
	alphabet_s *al = &tld->al;
	
    SetAlphabet(hmm->atype);

    P7Logoddsify(hmm, !do_forward); //TODO: clone model to avoid changes in it or make it thread safe??

    if (do_xnu && al->Alphabet_type == hmmNUCLEIC) {
        si.setError( "The HMM is a DNA model, and you can't use the --xnu filter on DNA data" );
        return res;
    }

    /*****************************************************************
    * Set up optional Pfam score thresholds. 
    * Can do this before starting any searches, since we'll only use 1 HMM.
    *****************************************************************/ 

    if (!SetAutocuts(&thresh, hmm)) {
        si.setError(  "HMM did not contain the GA, TC, or NC cutoffs you needed" );
        return res;
    }

    // set up structures for storing output
    histogram_s *histogram  = AllocHistogram(-200, 200, 100);  //keeps full histogram of all scores
    tophit_s   *ghit        = AllocTophits(200);               // per-seq hits: 200=lumpsize
    tophit_s   *dhit        = AllocTophits(200);               // domain hits:  200=lumpsize
    
    int     nseq = 0;         // number of sequences searched   
#ifdef UGENE_CELL
    if( HMMSearchAlgo_CellOptimized == s.alg ) {
        if( hmm->M < MAX_HMM_LENGTH ) {
            main_loop_spe(hmm, seq, seqLen, &thresh, do_forward, do_null2, do_xnu, histogram, ghit, dhit, &nseq, si);
        } else {
            main_loop_serial(hmm, seq, seqLen, &thresh, do_forward, do_null2, do_xnu, histogram, ghit, dhit, &nseq, si);
        }
    } else
#elif defined(HMMER_BUILD_WITH_SSE2)
    if( HMMSearchAlgo_SSEOptimized == s.alg ) {
        main_loop_opt(hmm, seq, seqLen, &thresh, do_forward, do_null2, do_xnu, histogram, ghit, dhit, &nseq, si, sseScoring);
    } else
#endif
    if( HMMSearchAlgo_Conservative == s.alg ) {
        main_loop_serial(hmm, seq, seqLen, &thresh, do_forward, do_null2, do_xnu, histogram, ghit, dhit, &nseq, si);
    }
    else {
        assert( false && "bad hmmsearch algorithm selected" );
    }
    // Process hit lists, produce text output

    // Set the theoretical EVD curve in our histogram using calibration in the HMM, if available. 
    if (hmm->flags & PLAN7_STATS) {
        ExtremeValueSetHistogram(histogram, hmm->mu, hmm->lambda, histogram->lowscore, histogram->highscore, 0);
    }
    if (!thresh.Z) {
        thresh.Z = nseq;       // set Z for good now that we're done
    }

    //report our output 

    FullSortTophits(dhit);

    //int namewidth = MAX(8, TophitsMaxName(ghit)); // max width of sequence name

    // Report domain hits (sorted on E-value)
    for (int i = 0; i < dhit->num && !si.cancelFlag; i++) {
        float   sc;                 // score of an HMM search                
        double  pvalue;             // pvalue of an HMM score
        double  evalue;             // evalue of an HMM score
        char    *name, *desc;       // hit sequence name and description
        double  motherp;            // pvalue of a whole seq HMM score
        float   mothersc;           // score of a whole seq parent of domain 
        int     sqfrom, sqto;       // coordinates in sequence                
        int     sqlen;              // length of seq that was hit
        int     hmmfrom, hmmto;     // coordinate in HMM                      
        int     ndom;               // total # of domains in this seq   
        int     domidx;             // number of this domain 

        GetRankedHit(dhit, i, &pvalue, &sc, &motherp, &mothersc,
                    &name, NULL, &desc,
                    &sqfrom, &sqto, &sqlen,      // seq position info
                    &hmmfrom, &hmmto, NULL,      // HMM position info 
                    &domidx, &ndom,              // domain info
                    NULL);                       // alignment info     

        evalue = pvalue * (double) thresh.Z;
        
        if (motherp * (double) thresh.Z > thresh.globE || mothersc < thresh.globT)  {
            continue;
        } else if (evalue <= thresh.domE && sc >= thresh.domT) {
            // hmm reports results in range [1...N] -> translate it to [0..N)
            res.append(UHMMSearchResult(U2Region(sqfrom-1, sqto-sqfrom+1), sc, evalue));
        }
    }

    //Clean-up and exit.
    FreeHistogram(histogram);
    FreeTophits(ghit);
    FreeTophits(dhit);
	FreePlan7( hmm );
    
    return res;
}
Beispiel #8
0
int 
main(void)
{
  struct p7trace_s *tr;         /* traceback of an alignment               */
  int      master_tid;		/* PVM TID of our master */
  char    *hmmfile;	        /* file to read HMM(s) from                */
  HMMFILE *hmmfp;               /* opened hmmfile for reading              */
  struct plan7_s *hmm;
  char    *seq;
  char    *dsq;
  int      len;
  int      nhmm;		/* number of HMM to work on                */
  float    sc;
  int      my_idx = -1;		/* my index, 0..nslaves-1 */
  float    globT;		/* T parameter: keep only hits > globT bits */
  double   globE;		/* E parameter: keep hits < globE E-value   */
  double   pvalue;		/* Z*pvalue = Evalue                        */
  int      Z;			/* nseq to base E value calculation on      */
  int      send_trace;		/* TRUE if score is significant             */
  int      do_xnu;		/* TRUE to do XNU filter on seq             */
  int      do_forward;		/* TRUE to use Forward() scores not Viterbi */
  int      do_null2;		/* TRUE to correct scores w/ ad hoc null2   */
  int      alphatype;		/* alphabet type, hmmAMINO or hmmNUCLEIC    */
  int      code;		/* return code after initialization         */

  
  /* Register leave_pvm() cleanup function so any exit() call
   * first calls pvm_exit().
   */
  if (atexit(leave_pvm) != 0) { pvm_exit(); Die("slave couldn't register leave_pvm()"); }

  /*****************************************************************
   * initialization.
   * Master broadcasts to us: 
   *     1) len of HMM file name        (int)
   *     2) name of HMM file            (string)
   *     3) length of sequence string   (int) 
   *     4) sequence                    (string)
   *     5) globT threshold
   *     6) globE threshold
   *     7) Z 
   *     8) do_xnu flag
   *     9) do_forward flag
   *    10) do_null2 flag
   *    11) alphabet type
   * We receive the broadcast and open the files.    
   ******************************************************************/

  master_tid = pvm_parent();	/* who's our master? */

  pvm_recv(master_tid, HMMPVM_INIT);
  pvm_upkint(&len, 1, 1);
  hmmfile = MallocOrDie(sizeof(char *) * (len+1));
  pvm_upkstr(hmmfile);
  pvm_upkint(&len, 1, 1);
  seq = MallocOrDie(sizeof(char *) * (len+1));
  pvm_upkstr(seq);
  pvm_upkfloat(&globT, 1, 1);
  pvm_upkdouble(&globE, 1, 1);
  pvm_upkint(&Z, 1, 1);
  pvm_upkint(&do_xnu, 1, 1);
  pvm_upkint(&do_forward, 1, 1);
  pvm_upkint(&do_null2, 1, 1);
  pvm_upkint(&alphatype, 1, 1);

  SetAlphabet(alphatype);
				/* Open HMM file (maybe in HMMERDB) */
  code = HMMPVM_OK;
  if ((hmmfp = HMMFileOpen(hmmfile, "HMMERDB")) == NULL)
    code = HMMPVM_NO_HMMFILE;
  else if (hmmfp->gsi == NULL)
    code = HMMPVM_NO_INDEX;
  
  /* report our status.
   */
  pvm_initsend(PvmDataDefault);
  pvm_pkint(&code, 1, 1);	
  pvm_send(master_tid, HMMPVM_RESULTS);

  dsq = DigitizeSequence(seq, len);
  if (do_xnu) XNU(dsq, len);

  /*****************************************************************
   * Main loop.
   * Receive an integer 0..nhmm-1 for which HMM to search against.
   * If we receive a -1, we shut down. 
   *****************************************************************/ 
  
  for (;;) 
    {
      pvm_recv(master_tid, HMMPVM_WORK);
      pvm_upkint(&nhmm, 1, 1);
      if (my_idx < 0) my_idx = nhmm; /* first time thru, remember what index we are. */

      if (nhmm == -1) break;	/* shutdown signal */

      /* move to our assigned HMM in the HMM file, and read it
       */
      HMMFilePositionByIndex(hmmfp, nhmm);
      if (! HMMFileRead(hmmfp, &hmm)) Die("unexpected end of HMM file"); 
      if (hmm == NULL)                Die("unexpected failure to parse HMM file"); 
      P7Logoddsify(hmm, TRUE);
      
      /* Score sequence, do alignment (Viterbi), recover trace
       */
      if (P7ViterbiSize(len, hmm->M) <= RAMLIMIT)
	{
	  SQD_DPRINTF1(("P7Viterbi(): Estimated size %d Mb\n", P7ViterbiSize(len, hmm->M)));
	  sc = P7Viterbi(dsq, len, hmm, &tr);
	}
      else
	{
	  SQD_DPRINTF1(("P7SmallViterbi() called; %d Mb > %d\n", P7ViterbiSize(len, hmm->M), RAMLIMIT));
	  sc = P7SmallViterbi(dsq, len, hmm, &tr);
	}

      if (do_forward) sc  = P7Forward(dsq, len, hmm, NULL);
      if (do_null2)   sc -= TraceScoreCorrection(hmm, tr, dsq);
	
      pvalue = PValue(hmm, sc);
      send_trace = (sc > globT && pvalue * (float) Z < globE) ? 1 : 0;

      /* return output
       */
      pvm_initsend(PvmDataDefault);
      pvm_pkint(&my_idx, 1, 1);	/* tell master who we are */
      pvm_pkstr(hmm->name);	/* double check that we did the right thing */
      pvm_pkfloat(&sc, 1, 1);
      pvm_pkdouble(&pvalue, 1, 1);
      pvm_pkint(&send_trace, 1, 1); /* flag for whether a trace structure is coming */
      if (send_trace) PVMPackTrace(tr);
      pvm_send(master_tid, HMMPVM_RESULTS);

      /* cleanup
       */
      FreePlan7(hmm);
      P7FreeTrace(tr);
    }

  /*********************************************** 
   * Cleanup, return.
   ***********************************************/

  HMMFileClose(hmmfp);
  free(seq);
  free(dsq);
  free(hmmfile);
  return 0;
}
Beispiel #9
0
_Trie::_Trie (const _String* alphabet) {
    SetAlphabet (alphabet, false);
    AppendNewInstance(new _SimpleList);
    payload << 0L;
    parents <<-1L;
}
Beispiel #10
0
static void main_loop_serial(struct plan7_s *hmm, int seed, int nsample, 
                            float lenmean, float lensd, int fixedlen,
                            struct histogram_s **ret_hist, float *ret_max, int& cancelFlag, int& progress)
{
    struct histogram_s *hist;
    struct dpmatrix_s  *mx;
    float  randomseq[MAXABET];
    float  p1;
    float  max;
    char  *seq;
    unsigned char  *dsq;
    float  score;
    int    sqlen;
    int    idx;

    // Initialize.
    // We assume we've already set the alphabet (safe, because
    // HMM input sets the alphabet).
    
    sre_srandom(seed);

	//get HMMERTaskLocalData
	HMMERTaskLocalData *tls = getHMMERTaskLocalData();
    alphabet_s &al = tls->al;
	
    SetAlphabet(hmm->atype);

    P7Logoddsify(hmm, TRUE);
    P7DefaultNullModel(randomseq, &p1);
    hist = AllocHistogram(-200, 200, 100);
    mx = CreatePlan7Matrix(1, hmm->M, 25, 0);
    max = -FLT_MAX;

    progress = 0;
    int pStub;
    
    for (idx = 0; idx < nsample && !cancelFlag; idx++) {
        // choose length of random sequence
        if (fixedlen) {
            sqlen = fixedlen;
        } else {
            do sqlen = (int) Gaussrandom(lenmean, lensd); while (sqlen < 1);
        }
        // generate it
        seq = RandomSequence(al.Alphabet, randomseq, al.Alphabet_size, sqlen);
        dsq = DigitizeSequence(seq, sqlen);

        if (P7ViterbiSpaceOK(sqlen, hmm->M, mx)) {
            score = P7Viterbi(dsq, sqlen, hmm, mx, NULL);
        } else {
            score = P7SmallViterbi(dsq, sqlen, hmm, mx, NULL, pStub);
        }
    
        AddToHistogram(hist, score);
        max = qMax(score, max);

        progress = int(100*idx/float(nsample));

        free(dsq); 
        free(seq);
    }

    FreePlan7Matrix(mx);
    *ret_hist   = hist;
    *ret_max    = max;
}
Beispiel #11
0
int 
main(void)
{
  int      master_tid;		/* PVM TID of our master */
  int      slaveidx;		/* my slave index (0..nslaves-1) */
  struct plan7_s *hmm;		/* HMM to calibrate, sent from master */
  struct histogram_s *hist;     /* score histogram */
  int      hmmidx;		/* index of this HMM */
  char    *seq;			/* synthetic random sequence */
  char    *dsq;			/* digitized seq */
  int      len;			/* length of seq */
  float    sc;			/* score of seq aligned to HMM */
  float    max;			/* maximum score seen in sample */
  int      seed;		/* random number seed */
  int      nsample;		/* number of seqs to sample */
  int      fixedlen;		/* if nonzero, fixed length of seq */
  float    lenmean;		/* Gaussian mean length of seq */
  float    lensd;		/* Gaussian length std. dev. for seq */
  int      fitok;		/* TRUE if EVD fit was OK */
  float    randomseq[MAXABET];	/* iid frequencies of residues */
  float    p1;
  int      alphatype;		/* alphabet type, hmmAMINO or hmmNUCLEIC    */
  int      idx;
  int      code;

  /* Register leave_pvm() cleanup function so any exit() call
   * first calls pvm_exit().
   */
  if (atexit(leave_pvm) != 0) { pvm_exit(); Die("slave couldn't register leave_pvm()"); }

  /*****************************************************************
   * initialization.
   * Master broadcasts the problem to us: parameters of the
   * HMM calibration.  
   ******************************************************************/

  master_tid = pvm_parent();	/* who's our master? */

  pvm_recv(master_tid, HMMPVM_INIT);
  pvm_upkint(&nsample,  1, 1);
  pvm_upkint(&fixedlen, 1, 1);
  pvm_upkfloat(&lenmean,  1, 1);
  pvm_upkfloat(&lensd,    1, 1);

  /* tell the master we're OK and ready to go (or not)
   */
  code = HMMPVM_OK;
  pvm_initsend(PvmDataDefault);
  pvm_pkint(&code, 1, 1);	
  pvm_send(master_tid, HMMPVM_RESULTS);

  /*****************************************************************
   * Main loop.
   * Receive a random number seed, then an HMM to search against.
   * If we receive a -1 seed, we shut down. 
   *****************************************************************/ 
  
  slaveidx = -1;
  for (;;) 
    {
      pvm_recv(master_tid, HMMPVM_WORK);
      pvm_upkint(&seed, 1, 1);
      if (seed == -1) break;	/* shutdown signal */
      pvm_upkint(&hmmidx, 1, 1);
      pvm_upkint(&alphatype,1, 1);
      SetAlphabet(alphatype);
      hmm = PVMUnpackHMM();
      if (hmm == NULL) Die("oh no, the HMM never arrived");

      if (slaveidx == -1) slaveidx = hmmidx; 
      P7DefaultNullModel(randomseq, &p1);

      sre_srandom(seed);
      P7Logoddsify(hmm, TRUE);
      hist = AllocHistogram(-200, 200, 100);
      max  = -FLT_MAX;

      for (idx = 0; idx < nsample; idx++)
	{
  				/* choose length of random sequence */
	  if (fixedlen) len = fixedlen;
	  else do len = (int) Gaussrandom(lenmean, lensd); while (len < 1);
				/* generate it */
	  seq = RandomSequence(Alphabet, randomseq, Alphabet_size, len);
	  dsq = DigitizeSequence(seq, len);

	  if (P7ViterbiSize(len, hmm->M) <= RAMLIMIT)
	    sc = P7Viterbi(dsq, len, hmm, NULL);
	  else
	    sc = P7SmallViterbi(dsq, len, hmm, NULL);

	  AddToHistogram(hist, sc);
	  if (sc > max) max = sc;
	  
	  free(seq);
	  free(dsq);
	}

      /* Fit an EVD to the observed histogram.
       * The TRUE left-censors and fits only the right slope of the histogram.
       * The 9999. is an arbitrary high number that means we won't trim outliers
       * on the right.
       */
      fitok = ExtremeValueFitHistogram(hist, TRUE, 9999.);

      /* Return output to master.
       * Currently we don't send the histogram back, but we could.
       */
      pvm_initsend(PvmDataDefault);
      pvm_pkint(&slaveidx, 1, 1);
      pvm_pkint(&hmmidx, 1, 1);	
      PVMPackString(hmm->name);
      pvm_pkint(&fitok,  1, 1);
      pvm_pkfloat(&(hist->param[EVD_MU]), 1, 1);
      pvm_pkfloat(&(hist->param[EVD_LAMBDA]), 1, 1);
      pvm_pkfloat(&max, 1, 1);
      pvm_send(master_tid, HMMPVM_RESULTS);

      /* cleanup
       */
      FreeHistogram(hist);
      FreePlan7(hmm);
    }

  /*********************************************** 
   * Cleanup, return.
   ***********************************************/

  return 0;			/* pvm_exit() is called by atexit() registration. */
}