Beispiel #1
0
static void
main_loop_serial(struct plan7_s *hmm, const char* seq, int seqLen, struct threshold_s *thresh, int do_forward,
                 int do_null2, int do_xnu, struct histogram_s *histogram, struct tophit_s *ghit, struct tophit_s *dhit, 
                 int *ret_nseq, TaskStateInfo& ti)
{
	//get HMMERTaskLocalData
	HMMERTaskLocalData *tld = getHMMERTaskLocalData();
	alphabet_s *al = &tld->al;

    struct dpmatrix_s *mx;      // DP matrix, growable
    struct p7trace_s *tr;       // traceback
    unsigned char   *dsq;       // digitized target sequence
    float  sc;                  // score of an HMM search
    double pvalue;              // pvalue of an HMM score
    double evalue;              // evalue of an HMM score

    // Create a DP matrix; initially only two rows big, but growable;
    // we overalloc by 25 rows (L dimension) when we grow; not growable
    // in model dimension, since we know the hmm size
    mx = CreatePlan7Matrix(1, hmm->M, 25, 0); 

    assert(seqLen > 0);

    dsq = DigitizeSequence(seq, seqLen);

    if (do_xnu && al->Alphabet_type == hmmAMINO) {
        XNU(dsq, seqLen);
    }

    // 1. Recover a trace by Viterbi.
    //    In extreme cases, the alignment may be literally impossible;
    //    in which case, the score comes out ridiculously small (but not
    //    necessarily <= -INFTY, because we're not terribly careful
    //    about underflow issues), and tr will be returned as NULL.
    if (P7ViterbiSpaceOK(seqLen, hmm->M, mx)) {
        sc = P7Viterbi(dsq, seqLen, hmm, mx, &tr);
    } else {
        sc = P7SmallViterbi(dsq, seqLen, hmm, mx, &tr, ti.progress);
    }

    // 2. If we're using Forward scores, calculate the
    //    whole sequence score; this overrides anything
    //    PostprocessSignificantHit() is going to do to the per-seq score.
    if (do_forward) {
        sc  = P7Forward(dsq, seqLen, hmm, NULL);
        if (do_null2)   sc -= TraceScoreCorrection(hmm, tr, dsq); 
    }

    // 2. Store score/pvalue for global alignment; will sort on score,
    //    which in hmmsearch is monotonic with E-value. 
    //    Keep all domains in a significant sequence hit.
    //    We can only make a lower bound estimate of E-value since
    //    we don't know the final value of nseq yet, so the list
    //    of hits we keep in memory is >= the list we actually
    //    output. 
    //
    pvalue = PValue(hmm, sc);
    evalue = thresh->Z ? (double) thresh->Z * pvalue : (double) pvalue;
    if (sc >= thresh->globT && evalue <= thresh->globE)  {
        sc = PostprocessSignificantHit(ghit, dhit, 
            tr, hmm, dsq, seqLen,
			(char *)"sequence", //todo: sqinfo.name, 
            NULL, 
            NULL, 
            do_forward, sc,
            do_null2,
            thresh,
            FALSE); // FALSE-> not hmmpfam mode, hmmsearch mode
    }
    AddToHistogram(histogram, sc);
    P7FreeTrace(tr);
    free(dsq);

    FreePlan7Matrix(mx);
    return;
}
Beispiel #2
0
/* Function: EmitSequence()
 * Date:     SRE, Sun Mar  8 12:28:03 1998 [St. Louis]
 *
 * Purpose:  Given a model, sample a sequence and/or traceback.
 *
 *           The walk starts in state N (after S) and repeatedly picks
 *           the next state and, where applicable, a residue, until the
 *           terminal state T is reached. Choices are made with FChoose()
 *           (presumably a sampler that returns an index drawn from the
 *           given probability vector -- confirm against its definition).
 *
 * Args:     hmm     - the model
 *           ret_dsq - RETURN: generated digitized sequence (pass NULL if unwanted)
 *           ret_L   - RETURN: length of generated sequence (pass NULL if unwanted)
 *           ret_tr  - RETURN: generated trace (pass NULL if unwanted)
 *
 * Returns:  void. Outputs that are not requested are freed here; a
 *           returned dsq was obtained from MallocOrDie()/ReallocOrDie()
 *           and a returned trace from P7AllocTrace().
 */
void
EmitSequence(struct plan7_s *hmm, char **ret_dsq, int *ret_L, struct p7trace_s **ret_tr)
{
  struct p7trace_s *tr;
  enum   p7stype    type;	/* current state type */
  int   k;			/* current node index */
  char *dsq;                    /* generated sequence, digitized */
  int   L;			/* next free index in dsq (index 0 is the leading sentinel) */
  int   alloc_tlen;		/* allocated space for traceback */
  int   alloc_L;		/* allocated space for sequence  */
  int   tpos;			/* position in traceback */
  int   sym;			/* a generated symbol index, or -1 if the state emitted nothing */
  float t[4];			/* little array for choosing M transition from */
  
  /* Initialize; allocations.
   * Both the trace and the sequence start with room for 64 entries
   * and are grown in steps of 64 inside the loop.
   */
  P7AllocTrace(64, &tr);
  alloc_tlen = 64;
  dsq = MallocOrDie(sizeof(char) * 64);
  alloc_L = 64;

  /* The trace always begins S -> N; dsq[0] holds a sentinel byte,
   * so the first emitted residue goes to dsq[1].
   */
  TraceSet(tr, 0, STS, 0, 0);
  TraceSet(tr, 1, STN, 0, 0);
  dsq[0] = (char) Alphabet_iupac;
  L      = 1;
  k      = 0;
  type   = STN;
  tpos   = 2;

  while (type != STT) 
    {
      /* Deal with state transition: choose the next state type and
       * node index from the current state.
       */
      switch (type) {
      case STB:	type = STM; k = FChoose(hmm->begin+1, hmm->M) + 1; break;	/* enter at a match state 1..M */
      case STI:	type = (FChoose(hmm->t[k]+TIM, 2) == 0)    ? STM : STI; if (type == STM) k++; break;
      case STN: type = (FChoose(hmm->xt[XTN], 2)  == LOOP) ? STN : STB; k = 0; break;
      case STE:	type = (FChoose(hmm->xt[XTE], 2)  == LOOP) ? STJ : STC; k = 0; break;
      case STC:	type = (FChoose(hmm->xt[XTC], 2)  == LOOP) ? STC : STT; k = 0; break;
      case STJ:	type = (FChoose(hmm->xt[XTJ], 2)  == LOOP) ? STJ : STB; k = 0; break;

      case STD:	
	/* A delete state at the last node can only go to E. */
	if (k < hmm->M) {
	  type = (FChoose(hmm->t[k]+TDM, 2) == 0) ? STM : STD; 
	  k++;   
	} else {
	  type = STE;
	  k = 0;
	}
	break;

      case STM:
	/* Four-way choice: the first three transitions of node k
	 * (copied from hmm->t[k]) plus the exit probability hmm->end[k].
	 * The last match state can only go to E.
	 */
	if (k < hmm->M) {
	  FCopy(t, hmm->t[k], 3);
	  t[3] = hmm->end[k];
	  switch (FChoose(t,4)) {
	  case 0: k++;  type = STM; break;
	  case 1:       type = STI; break;
	  case 2: k++;  type = STD; break;
	  case 3: k=0;  type = STE; break;
	  default: Die("never happens");
	  }
	} else {
	  k    = 0;
	  type = STE;
	}
	break;

      case STT:
      case STBOGUS:
      default:
	Die("can't happen.");
      }
  
      /* Choose a symbol emission, if necessary.
       * Match and insert states emit from their own distributions.
       * N, C and J emit (from the null distribution) only when the
       * previous trace entry has the same state type, i.e. when the
       * state was re-entered rather than entered for the first time.
       */
      sym = -1;
      if      (type == STM) sym = FChoose(hmm->mat[k], Alphabet_size);
      else if (type == STI) sym = FChoose(hmm->ins[k], Alphabet_size); 
      else if ((type == STN && tr->statetype[tpos-1] == STN) ||
	       (type == STC && tr->statetype[tpos-1] == STC) ||
	       (type == STJ && tr->statetype[tpos-1] == STJ))
	sym = FChoose(hmm->null, Alphabet_size);
	
      /* Add to the traceback; deal with realloc if necessary.
       * The sequence position recorded is L for emitting states, 0 otherwise.
       */
      TraceSet(tr, tpos, type, k, (sym != -1) ? L : 0);
      tpos++;
      if (tpos == alloc_tlen) {
	alloc_tlen += 64; 
	P7ReallocTrace(tr, alloc_tlen);
      }

      /* Add to the digitized seq; deal with realloc, if necessary
       */
      if (sym != -1) {
	dsq[L] = (char) sym;
	L++;
	if (L+1 == alloc_L) {	/* L+1 leaves room for sentinel byte + \0 */
	  alloc_L += 64;
	  dsq = ReallocOrDie(dsq, sizeof(char) * alloc_L);
	}
      }
    }
  
  /* Finish off the trace: tpos is the number of trace entries written.
   */ 
  tr->tlen = tpos;

  /* Finish off the dsq with sentinel byte and null terminator.
   * Emitted Sequence length is L-1 (L counted the leading sentinel).
   */
  dsq[L]   = (char) Alphabet_iupac;
  dsq[L+1] = '\0';
  L--;

  /* Return: hand over what the caller asked for, free the rest.
   */
  if (ret_dsq != NULL) *ret_dsq = dsq; else free(dsq);
  if (ret_L   != NULL) *ret_L   = L;
  if (ret_tr  != NULL) *ret_tr  = tr;  else P7FreeTrace(tr);
  return;
}
Beispiel #3
0
/* Function: main()   [PVM slave]
 *
 * Purpose:  Slave process: receives one target sequence and the search
 *           settings from the PVM master, then repeatedly receives the
 *           index of an HMM in the HMM file, scores the sequence against
 *           that HMM and sends the result back. A work index of -1 is
 *           the shutdown signal.
 *
 *           The status of opening the HMM file is reported to the master
 *           (HMMPVM_OK, HMMPVM_NO_HMMFILE or HMMPVM_NO_INDEX). If the file
 *           could not be opened, a subsequent work request is a fatal
 *           error and shutdown does not try to close the missing file.
 *
 * Returns:  0 on normal shutdown.
 */
int 
main(void)
{
  struct p7trace_s *tr;         /* traceback of an alignment               */
  int      master_tid;		/* PVM TID of our master */
  char    *hmmfile;	        /* file to read HMM(s) from                */
  HMMFILE *hmmfp;               /* opened hmmfile for reading              */
  struct plan7_s *hmm;
  char    *seq;
  char    *dsq;
  int      len;
  int      nhmm;		/* number of HMM to work on                */
  float    sc;
  int      my_idx = -1;		/* my index, 0..nslaves-1 */
  float    globT;		/* T parameter: keep only hits > globT bits */
  double   globE;		/* E parameter: keep hits < globE E-value   */
  double   pvalue;		/* Z*pvalue = Evalue                        */
  int      Z;			/* nseq to base E value calculation on      */
  int      send_trace;		/* TRUE if score is significant             */
  int      do_xnu;		/* TRUE to do XNU filter on seq             */
  int      do_forward;		/* TRUE to use Forward() scores not Viterbi */
  int      do_null2;		/* TRUE to correct scores w/ ad hoc null2   */
  int      alphatype;		/* alphabet type, hmmAMINO or hmmNUCLEIC    */
  int      code;		/* return code after initialization         */

  
  /* Register leave_pvm() cleanup function so any exit() call
   * first calls pvm_exit().
   */
  if (atexit(leave_pvm) != 0) { pvm_exit(); Die("slave couldn't register leave_pvm()"); }

  /*****************************************************************
   * initialization.
   * Master broadcasts to us: 
   *     1) len of HMM file name        (int)
   *     2) name of HMM file            (string)
   *     3) length of sequence string   (int) 
   *     4) sequence                    (string)
   *     5) globT threshold
   *     6) globE threshold
   *     7) Z 
   *     8) do_xnu flag
   *     9) do_forward flag
   *    10) do_null2 flag
   *    11) alphabet type
   * We receive the broadcast and open the files.    
   ******************************************************************/

  master_tid = pvm_parent();	/* who's our master? */

  pvm_recv(master_tid, HMMPVM_INIT);
  pvm_upkint(&len, 1, 1);
				/* character buffers: len chars plus terminator */
  hmmfile = MallocOrDie(sizeof(char) * (len+1));
  pvm_upkstr(hmmfile);
  pvm_upkint(&len, 1, 1);	/* from here on, len is the sequence length */
  seq = MallocOrDie(sizeof(char) * (len+1));
  pvm_upkstr(seq);
  pvm_upkfloat(&globT, 1, 1);
  pvm_upkdouble(&globE, 1, 1);
  pvm_upkint(&Z, 1, 1);
  pvm_upkint(&do_xnu, 1, 1);
  pvm_upkint(&do_forward, 1, 1);
  pvm_upkint(&do_null2, 1, 1);
  pvm_upkint(&alphatype, 1, 1);

  SetAlphabet(alphatype);
				/* Open HMM file (maybe in HMMERDB) */
  code = HMMPVM_OK;
  if ((hmmfp = HMMFileOpen(hmmfile, "HMMERDB")) == NULL)
    code = HMMPVM_NO_HMMFILE;
  else if (hmmfp->gsi == NULL)
    code = HMMPVM_NO_INDEX;
  
  /* report our status.
   */
  pvm_initsend(PvmDataDefault);
  pvm_pkint(&code, 1, 1);	
  pvm_send(master_tid, HMMPVM_RESULTS);

  dsq = DigitizeSequence(seq, len);
  if (do_xnu) XNU(dsq, len);

  /*****************************************************************
   * Main loop.
   * Receive an integer 0..nhmm-1 for which HMM to search against.
   * If we receive a -1, we shut down. 
   *****************************************************************/ 
  
  for (;;) 
    {
      pvm_recv(master_tid, HMMPVM_WORK);
      pvm_upkint(&nhmm, 1, 1);
      if (my_idx < 0) my_idx = nhmm; /* first time thru, remember what index we are. */

      if (nhmm == -1) break;	/* shutdown signal */

      /* We already told the master the file could not be opened; if it
       * sends work anyway, fail cleanly instead of dereferencing NULL.
       */
      if (hmmfp == NULL) Die("slave received work but HMM file %s is not open", hmmfile);

      /* move to our assigned HMM in the HMM file, and read it
       */
      HMMFilePositionByIndex(hmmfp, nhmm);
      if (! HMMFileRead(hmmfp, &hmm)) Die("unexpected end of HMM file"); 
      if (hmm == NULL)                Die("unexpected failure to parse HMM file"); 
      P7Logoddsify(hmm, TRUE);
      
      /* Score sequence, do alignment (Viterbi), recover trace.
       * The small-memory variant is used when the estimated matrix
       * size exceeds RAMLIMIT.
       */
      if (P7ViterbiSize(len, hmm->M) <= RAMLIMIT)
	{
	  SQD_DPRINTF1(("P7Viterbi(): Estimated size %d Mb\n", P7ViterbiSize(len, hmm->M)));
	  sc = P7Viterbi(dsq, len, hmm, &tr);
	}
      else
	{
	  SQD_DPRINTF1(("P7SmallViterbi() called; %d Mb > %d\n", P7ViterbiSize(len, hmm->M), RAMLIMIT));
	  sc = P7SmallViterbi(dsq, len, hmm, &tr);
	}

      if (do_forward) sc  = P7Forward(dsq, len, hmm, NULL);
      if (do_null2)   sc -= TraceScoreCorrection(hmm, tr, dsq);
	
      pvalue = PValue(hmm, sc);
      send_trace = (sc > globT && pvalue * (float) Z < globE) ? 1 : 0;

      /* return output
       */
      pvm_initsend(PvmDataDefault);
      pvm_pkint(&my_idx, 1, 1);	/* tell master who we are */
      pvm_pkstr(hmm->name);	/* double check that we did the right thing */
      pvm_pkfloat(&sc, 1, 1);
      pvm_pkdouble(&pvalue, 1, 1);
      pvm_pkint(&send_trace, 1, 1); /* flag for whether a trace structure is coming */
      if (send_trace) PVMPackTrace(tr);
      pvm_send(master_tid, HMMPVM_RESULTS);

      /* cleanup
       */
      FreePlan7(hmm);
      P7FreeTrace(tr);
    }

  /*********************************************** 
   * Cleanup, return.
   ***********************************************/

  if (hmmfp != NULL) HMMFileClose(hmmfp);	/* NULL when the open failed above */
  free(seq);
  free(dsq);
  free(hmmfile);
  return 0;
}
Beispiel #4
0
int main(int argc, char **argv) 
{
    const char      *hmmfile;	/* file to read HMMs from                  */
    FILE            *fp;	/* output file handle                      */
    HMMFILE         *hmmfp;	/* opened hmmfile for reading              */
    struct plan7_s  *hmm;	/* HMM to generate from                    */
    int              L;		/* length of a sequence                    */
    int              i;		/* counter over sequences                  */

    char            *ofile;	/* output sequence file                    */
    int              nseq;	/* number of seqs to sample                */
    int              seed;	/* random number generator seed            */
    int              be_quiet;	/* TRUE to silence header/footer           */
    int              do_alignment; /* TRUE to output in aligned format     */ 
    int              do_consensus; /* TRUE to do a single consensus seq    */

    AjBool ajselex;
    AjBool ajcons;
    AjPFile inf=NULL;
    AjPFile outf=NULL;
    AjPStr  instr=NULL;
    AjPStr  outstr=NULL;
  

#ifdef MEMDEBUG
    unsigned long histid1, histid2, orig_size, current_size;
    orig_size = malloc_inuse(&histid1);
    fprintf(stderr, "[... memory debugging is ON ...]\n");
#endif

    /*********************************************** 
     * Parse command line
     ***********************************************/

    nseq         = 10;

    be_quiet     = FALSE;
    do_alignment = FALSE;  
    do_consensus = FALSE;
    ofile        = NULL;

    embInitPV("ohmmemit",argc,argv,"HMMER",VERSION);

    ajselex = ajAcdGetBoolean("selex");
    ajcons  = ajAcdGetBoolean("consensus");
    nseq    = ajAcdGetInt("number");
    seed    = ajAcdGetInt("seed");
    inf     = ajAcdGetInfile("infile");
    outf    = ajAcdGetOutfile("outfile");
  
    if(!seed)
	seed = time ((time_t *) NULL);

    if(ajselex)
	do_alignment=TRUE;
    else
	do_alignment=FALSE;
  
    if(ajcons)
	do_consensus=TRUE;
    else
	do_consensus=FALSE;

    instr  = ajStrNewC((char *)ajFileGetNameC(inf));
    outstr = ajStrNewC((char *)ajFileGetNameC(outf));

    hmmfile = ajStrGetPtr(instr);

    sre_srandom(seed);

    if (do_alignment && do_consensus)
	ajFatal("Sorry, -selex and -consensus are incompatible.\n"); 
    if (nseq != 10 && do_consensus)
	ajWarn("-consensus overrides -number (# of sampled seqs)");

    /*********************************************** 
     * Open HMM file (might be in HMMERDB or current directory).
     * Read a single HMM from it.
     ***********************************************/

    if ((hmmfp = HMMFileOpen(hmmfile, "HMMERDB")) == NULL)
	ajFatal("Failed to open HMM file %s\n", hmmfile);
    if (!HMMFileRead(hmmfp, &hmm)) 
	ajFatal("Failed to read any HMMs from %s\n", hmmfile);
    HMMFileClose(hmmfp);
    if (hmm == NULL) 
	ajFatal("HMM file %s corrupt or in incorrect format? Parse failed",
		hmmfile);

    /* Configure the HMM to shut off N,J,C emission: so we
     * do a simple single pass through the model.
     */
    Plan7NakedConfig(hmm);
    Plan7Renormalize(hmm);

    /*********************************************** 
     * Open the output file, or stdout
     ***********************************************/ 

    fp = ajFileGetFileptr(outf);
  
 
    /*********************************************** 
     * Show the options banner
     ***********************************************/
    be_quiet=TRUE;
    if (! be_quiet) 
    {
	printf("HMM file:             %s\n", hmmfile);
	if (! do_consensus)
	{
	    printf("Number of seqs:       %d\n", nseq);
	    printf("Random seed:          %d\n", seed);
	}
	printf("- - - - - - - - - - - - - - - - - - - - - - - - - "
	       "- - - - - - -\n\n");
    }

    /*********************************************** 
     * Do the work.
     * If we're generating an alignment, we have to collect
     * all our traces, then output. If we're generating unaligned
     * sequences, we can emit one at a time.
     ***********************************************/

    if (do_consensus) 
    {
	char    *seq;
	SQINFO   sqinfo;	/* info about sequence (name/desc)        */

	EmitConsensusSequence(hmm, &seq, NULL, &L, NULL);
	strcpy(sqinfo.name, "consensus");
	sqinfo.len = L;
	sqinfo.flags = SQINFO_NAME | SQINFO_LEN;

	WriteSeq(fp, kPearson, seq, &sqinfo);
	free(seq);
    }
    else if (do_alignment)
    {
	struct p7trace_s **tr;
	char           **dsq;
	SQINFO          *sqinfo;
	char           **aseq;
	AINFO            ainfo;
	float           *wgt;

	dsq    = MallocOrDie(sizeof(char *)             * nseq);
	tr     = MallocOrDie(sizeof(struct p7trace_s *) * nseq);
	sqinfo = MallocOrDie(sizeof(SQINFO)             * nseq);
	wgt    = MallocOrDie(sizeof(float)              * nseq);
	FSet(wgt, nseq, 1.0);

	for (i = 0; i < nseq; i++)
	{
	    EmitSequence(hmm, &(dsq[i]), &L, &(tr[i]));
	    sprintf(sqinfo[i].name, "seq%d", i+1);
	    sqinfo[i].len   = L;
	    sqinfo[i].flags = SQINFO_NAME | SQINFO_LEN;
	}

	P7Traces2Alignment(dsq, sqinfo, wgt, nseq, hmm->M, tr, FALSE, 
			   &aseq, &ainfo);

	/* Output the alignment */
	WriteSELEX(fp, aseq, &ainfo, 50);
	if (ofile != NULL && !be_quiet)
	    printf("Alignment saved in file %s\n", ofile);

	/* Free memory
	 */
	for (i = 0; i < nseq; i++) 
	{
	    P7FreeTrace(tr[i]);
	    free(dsq[i]);
	}
	FreeAlignment(aseq, &ainfo);
	free(sqinfo);
	free(dsq);
	free(wgt);
	free(tr);
    }
    else				/* unaligned sequence output */
    {
	struct p7trace_s *tr;
	char             *dsq;
	char             *seq;
	SQINFO            sqinfo;

	for (i = 0; i < nseq; i++)
	{
	    EmitSequence(hmm, &dsq, &L, &tr);
	    sprintf(sqinfo.name, "seq%d", i+1);
	    sqinfo.len   = L;
	    sqinfo.flags = SQINFO_NAME | SQINFO_LEN;

	    seq = DedigitizeSequence(dsq, L);

	    WriteSeq(fp, kPearson, seq, &sqinfo);
	  
	    P7FreeTrace(tr);
	    free(dsq);
	    free(seq);
	}
    }

    ajFileClose(&outf);
  
    FreePlan7(hmm);
    SqdClean();

#ifdef MEMDEBUG
    current_size = malloc_inuse(&histid2);
    if (current_size != orig_size)
	malloc_list(2, histid1, histid2);
    else
	fprintf(stderr, "[No memory leaks.]\n");
#endif


    ajStrDel(&instr);
    ajStrDel(&outstr);
    ajFileClose(&inf);
    ajFileClose(&outf);

    embExit();
    return 0;
}
Beispiel #5
0
/* Function: EmitConsensusSequence()
 * Date:     SRE, Wed Nov 11 11:08:59 1998 [St. Louis]
 *
 * Purpose:  Generate a "consensus sequence". For the purposes
 *           of a profile HMM, this is defined as:
 *              - for each node:
 *                 - if StateOccupancy() says that M is used 
 *                     with probability >= 0.5, this M is "consensus".
 *                     Then, choose maximally likely residue.
 *                     if P>0.5 (protein) or P>0.9 (DNA), make
 *                     it upper case; else make it lower case. 
 *                 - if StateOccupancy() says that I
 *                     is used with P >= 0.5, this I is "consensus";
 *                     use it 1/(1-TII) times (its expectation value).
 *                     Generate an "x" from each I.
 *                     
 *           The function expects that the model is config'ed
 *           by Plan7NakedConfig(): that is, for a single global pass
 *           with no N,C,J involvement.
 *                     
 *
 * Args:     hmm     - the model
 *           ret_seq - RETURN: consensus sequence (pass NULL if unwanted)
 *           ret_dsq - RETURN: digitized consensus sequence (pass NULL if unwanted)
 *           ret_L   - RETURN: length of generated sequence 
 *           ret_tr  - RETURN: generated trace (pass NULL if unwanted)
 *
 * Returns:  void        
 */
void
EmitConsensusSequence(struct plan7_s *hmm, char **ret_seq, char **ret_dsq, int *ret_L, struct p7trace_s **ret_tr)
{
  struct p7trace_s *tr;         /* RETURN: traceback */
  char *dsq, *seq;              /* sequence in digitized and undigitized form */
  float *mp, *ip, *dp;          /* state occupancies from StateOccupancy() */
  int    nmat, ndel, nins;	/* number of matches, deletes, inserts used */
  int    k;			/* counter for nodes */
  int    tpos;			/* position in trace */
  int    i;                     /* position in seq (equiv pos in dsq is i+1 */
  int    x;			/* symbol choice (M) or # symbols (I) */
  float  mthresh;		/* >= this, show symbol as upper case */

  if (Alphabet_type == hmmAMINO) mthresh = 0.5;
  else                           mthresh = 0.9;

  StateOccupancy(hmm, &mp, &ip, &dp);

  /* First pass: how many states do we need in the trace?
   *             how long will the sequence be?
   */
  nmat = ndel = nins = 0;
  for (k = 1; k <= hmm->M; k++)
    {
      if (mp[k] >= 0.5) nmat++; else ndel++;
      if (k < hmm->M && ip[k] >= 0.5) 
	nins += (int) (1.f / (1.f - hmm->t[k][TII]));
    }
  
  /* Allocations
   */
  P7AllocTrace(6 + nmat + ndel + nins, &tr);
  dsq = MallocOrDie(sizeof(char) * (nmat+nins+3));
  seq = MallocOrDie(sizeof(char) * (nmat+nins+1));

  /* Main pass.
   * Construct consensus trace, seq, and dsq.
   */
  TraceSet(tr, 0, STS, 0, 0);
  TraceSet(tr, 1, STN, 0, 0);
  TraceSet(tr, 2, STB, 0, 0);
  dsq[0] = Alphabet_iupac;	/* guard byte */
  tpos = 3;
  i    = 0;
  for (k = 1; k <= hmm->M; k++)
    {
      if (mp[k] >= 0.5)
	{
	  x = FMax(hmm->mat[k], Alphabet_size);
	  TraceSet(tr, tpos, STM, k, i+1);
	  seq[i]   = Alphabet[x];
	  dsq[i+1] = x;
	  if (hmm->mat[k][x] < mthresh)
	    seq[i] = tolower((int) seq[i]);
	  i++;
	  tpos++;
	}
      else
	{
	  TraceSet(tr, tpos, STD, k, 0);
	  tpos++;
	}

      if (k < hmm->M && ip[k] >= 0.5)
	{
	  x = (int) (1.f / (1.f - hmm->t[k][TII]));
	  while (x--) 
	    {
	      TraceSet(tr, tpos, STI, k, i+1);
	      seq[i]   = 'x';
	      dsq[i+1] = Alphabet_iupac - 1;
	      i++; 
	      tpos++;
	    }
	}
    }
  TraceSet(tr, tpos, STE, 0, 0); tpos++;
  TraceSet(tr, tpos, STC, 0, 0); tpos++;
  TraceSet(tr, tpos, STT, 0, 0); tpos++;
  dsq[i+1] = Alphabet_iupac;
    
  free(mp);
  free(ip);
  free(dp);
  if (ret_seq != NULL) *ret_seq = seq; else free(seq);
  if (ret_dsq != NULL) *ret_dsq = dsq; else free(dsq);
  if (ret_L   != NULL) *ret_L   = i;   
  if (ret_tr  != NULL) *ret_tr  = tr;  else P7FreeTrace(tr);
}
Beispiel #6
0
/* Function: main()   [ohmmalign]
 *
 * Purpose:  EMBOSS front end that aligns a set of sequences to a single
 *           HMM and writes the result as a SELEX alignment. Optionally
 *           merges in an existing alignment, either via the HMM's map
 *           ("mapalifile") or by aligning it to the model ("withalifile").
 *
 * Returns:  0 on success; fatal errors exit through ajFatal().
 */
int main(int argc, char **argv) 
{
  const char      *hmmfile;	/* file to read HMMs from                  */
  HMMFILE         *hmmfp;       /* opened hmmfile for reading              */
  const char      *seqfile;     /* file to read target sequence from       */ 
  char           **rseq;        /* raw, unaligned sequences                */ 
  SQINFO          *sqinfo;      /* info associated with sequences          */
  char           **dsq;         /* digitized raw sequences                 */
  int              nseq;        /* number of sequences                     */  
  char           **aseq;        /* aligned sequences                       */
  AINFO            ainfo;       /* alignment information                   */
  float           *wgt;         /* per-sequence weights                    */
  int              i;
  struct plan7_s    *hmm;       /* HMM to align to                         */ 
  struct p7trace_s **tr;        /* traces for aligned sequences            */

  int   be_quiet;		/* TRUE to suppress verbose banner          */
  int   matchonly;		/* TRUE to show only match state syms       */
  const char *outfile;          /* optional alignment output file           */
  FILE *ofp;                    /* handle on alignment output file          */
  AjPFile ajwithali;          /* additional alignment file to align */
  AjPFile ajmapali;           /* additional alignment file to map   */
  AjBool ajmatch=ajFalse;
  AjPFile outf=NULL;
  AjPStr  outfname=NULL;
  AjPFile inf=NULL;
  AjPStr  infname=NULL;
  AjPSeqset seqset=NULL;
  AjPStr  ajseqfile=NULL;
  char*  mapali=NULL;           /* heap copy of the map alignment file name  */
  char*  withali=NULL;          /* heap copy of the with alignment file name */
  
#ifdef MEMDEBUG
  unsigned long histid1, histid2, orig_size, current_size;
  orig_size = malloc_inuse(&histid1);
  fprintf(stderr, "[... memory debugging is ON ...]\n");
#endif

  /*********************************************** 
   * Parse command line
   ***********************************************/
  
  embInitPV("ohmmalign",argc,argv,"HMMER",VERSION);

  ajmatch   = ajAcdGetBoolean("matchonly");
  matchonly = ajmatch ? TRUE : FALSE;

  /* Only the file names are needed; the AJAX handles are closed at
   * once and the names kept as C strings owned by this function.
   */
  ajmapali = ajAcdGetInfile("mapalifile");
  if (ajmapali)
      mapali = ajCharNewS(ajFileGetNameS(ajmapali));
  ajFileClose(&ajmapali);
  ajwithali = ajAcdGetInfile("withalifile");
  if (ajwithali)
      withali = ajCharNewS(ajFileGetNameS(ajwithali));
  ajFileClose(&ajwithali);

  /* The output file is reopened by name with fopen() further down, so
   * the AJAX handle is closed here when the name starts with a
   * printable character.
   */
  outf = ajAcdGetOutfile("outfile");
  outfname = ajStrNewC((char *)ajFileGetNameC(outf));
  if(*ajStrGetPtr(outfname)>31)
      ajFileClose(&outf);
  outfile = ajStrGetPtr(outfname);

  inf = ajAcdGetInfile("hmmfile");
  infname = ajStrNewC((char *)ajFileGetNameC(inf));
  ajFileClose(&inf);
  hmmfile = ajStrGetPtr(infname);

  
  seqset = ajAcdGetSeqset("sequences");
  ajseqfile = ajStrNewC(ajStrGetPtr(seqset->Filename));
  seqfile = ajStrGetPtr(ajseqfile);
  

 /*********************************************** 
  * Open HMM file (might be in HMMERDB or current directory).
  * Read a single HMM from it.
  * 
  * Currently hmmalign disallows the J state and
  * only allows one domain per sequence. To preserve
  * the S/W entry information, the J state is explicitly
  * disallowed, rather than calling a Plan7*Config() function.
  * this is a workaround in 2.1 for the 2.0.x "yo!" bug.
  ***********************************************/

  if ((hmmfp = HMMFileOpen(hmmfile, "HMMERDB")) == NULL)
    ajFatal("Failed to open HMM file %s\n", hmmfile);
  if (!HMMFileRead(hmmfp, &hmm)) 
    ajFatal("Failed to read any HMMs from %s\n", hmmfile);
  HMMFileClose(hmmfp);
  if (hmm == NULL) 
    ajFatal("HMM file %s corrupt or in incorrect format? Parse failed", hmmfile);
  hmm->xt[XTE][MOVE] = 1.;	      /* only 1 domain/sequence ("global" alignment) */
  hmm->xt[XTE][LOOP] = 0.;
  P7Logoddsify(hmm, TRUE);
				/* do we have the map we might need? */
  if (mapali != NULL && ! (hmm->flags & PLAN7_MAP))
    ajFatal("HMMER: HMM file %s has no map; you can't use --mapali.", hmmfile);

  /*********************************************** 
   * Get the sequences. They are taken from the EMBOSS
   * sequence set rather than being read from seqfile.
   ***********************************************/

  emboss_rseqs(seqset,&rseq,&sqinfo,&nseq);

  /*********************************************** 
   * Show the banner (always suppressed in this port)
   ***********************************************/

  be_quiet=TRUE;
  if (! be_quiet) 
    {
      printf(   "HMM file:             %s\n", hmmfile);
      printf(   "Sequence file:        %s\n", seqfile);
      printf("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n\n");
    }

  /*********************************************** 
   * Do the work
   ***********************************************/

  /* Allocations and initializations.
   */
  dsq = MallocOrDie(sizeof(char *) * nseq);
  tr  = MallocOrDie(sizeof(struct p7trace_s *) * nseq);

  /* Align each sequence to the model, collect traces.
   * The scores are not needed, only the tracebacks.
   */
  for (i = 0; i < nseq; i++)
    {
      dsq[i] = DigitizeSequence(rseq[i], sqinfo[i].len);

      if (P7ViterbiSize(sqinfo[i].len, hmm->M) <= RAMLIMIT)
	(void) P7Viterbi(dsq[i], sqinfo[i].len, hmm, &(tr[i]));
      else
	(void) P7SmallViterbi(dsq[i], sqinfo[i].len, hmm, &(tr[i]));
    }

  /* Include an aligned alignment, if desired. Each call grows
   * rseq/dsq/sqinfo/tr and increases nseq.
   */
  if (mapali != NULL)
    include_alignment(mapali, hmm, TRUE, &rseq, &dsq, &sqinfo, &tr, &nseq);
  if (withali != NULL) 
    include_alignment(withali, hmm, FALSE, &rseq, &dsq, &sqinfo, &tr, &nseq);

  /* Turn traces into a multiple alignment
   */ 
  wgt = MallocOrDie(sizeof(float) * nseq);
  FSet(wgt, nseq, 1.0);
  P7Traces2Alignment(dsq, sqinfo, wgt, nseq, hmm->M, tr, matchonly,
		     &aseq, &ainfo);

  /*********************************************** 
   * Output the alignment; falls back to stdout when the
   * output file cannot be opened.
   ***********************************************/

  if (outfile != NULL && (ofp = fopen(outfile, "w")) != NULL)
    {
      WriteSELEX(ofp, aseq, &ainfo, 50);
      printf("Alignment saved in file %s\n", outfile);
      fclose(ofp);
    }
  else
    WriteSELEX(stdout, aseq, &ainfo, 50);

  /*********************************************** 
   * Cleanup and exit
   ***********************************************/
  
  for (i = 0; i < nseq; i++) 
    {
      P7FreeTrace(tr[i]);
      FreeSequence(rseq[i], &(sqinfo[i]));
      free(dsq[i]);
    }
  FreeAlignment(aseq, &ainfo);
  FreePlan7(hmm);
  free(sqinfo);
  free(rseq);
  free(dsq);
  free(wgt);
  free(tr);

  SqdClean();

  ajStrDel(&outfname);
  ajStrDel(&infname);
  ajStrDel(&ajseqfile);
				/* release the file-name copies made with ajCharNewS() */
  if (mapali != NULL)
    ajCharDel(&mapali);
  if (withali != NULL)
    ajCharDel(&withali);
  

#ifdef MEMDEBUG
  current_size = malloc_inuse(&histid2);
  if (current_size != orig_size) malloc_list(2, histid1, histid2);
  else fprintf(stderr, "[No memory leaks.]\n");
#endif

  ajSeqsetDel(&seqset);
  ajFileClose(&ajwithali);
  ajFileClose(&ajmapali);

  embExit();
  
  return 0;
}
Beispiel #7
0
/* Function: include_alignment()
 * Date:     SRE, Sun Jul  5 15:25:13 1998 [St. Louis]
 *
 * Purpose:  Read a multiple alignment from a file, align it to the
 *           HMM as a unit, and append its sequences and traces to
 *           the caller's existing arrays. With do_mapped TRUE the
 *           master trace comes from the HMM's alignment map
 *           (after a checksum comparison); otherwise it comes from
 *           P7ViterbiAlignAlignment().
 *
 * Args:     seqfile  - name of alignment file
 *           hmm      - model to align to
 *           do_mapped- TRUE if we're to use the HMM's alignment map
 *           rsq      - RETURN: array of rseqs to add to
 *           dsq      - RETURN: array of dsq to add to
 *           sqinfo   - RETURN: array of SQINFO to add to
 *           tr       - RETURN: array of traces to add to
 *           nseq     - RETURN: number of seqs           
 *
 * Returns:  void. rsq, dsq, sqinfo and tr are replaced by enlarged
 *           arrays and *nseq grows by the number of sequences in
 *           the alignment file.
 */
void
include_alignment(char *seqfile, struct plan7_s *hmm, int do_mapped,
		  char ***rsq, char ***dsq, SQINFO **sqinfo, 
		  struct p7trace_s ***tr, int *nseq)
{
  AINFO  ainfo;			/* info that goes with aseq       */
  char **aseq;			/* aligned seqs read from file    */
  char **fresh_rseq;		/* dealigned copies of aseq       */
  char **fresh_dsq;		/* digitized copies of aseq       */
  struct p7trace_s  *master;	/* trace for the whole alignment  */
  struct p7trace_s **addtr;	/* per-sequence traces            */
  int    format;		/* format of alignment file       */
  int    base;			/* number of seqs already present */
  int    total;			/* number of seqs after merging   */
  int    i;			/* counter over new sequences     */

  /* Work out the file format; both failure modes are fatal. */
  if (! SeqfileFormat(seqfile, &format, NULL))
    {
      if (squid_errno == SQERR_NOFILE)
	ajFatal("Alignment file %s could not be opened for reading", seqfile);
      ajFatal("Failed to determine format of alignment file %s", seqfile);
    }

  /* Read the alignment and upper-case every row. */
  if (! ReadAlignment(seqfile, format, &aseq, &ainfo))
    ajFatal("Failed to read aligned sequence file %s", seqfile);
  for (i = 0; i < ainfo.nseq; i++)
    s2upper(aseq[i]);

  /* A mapped alignment must be the one the HMM's map was built from. */
  if (do_mapped && GCGMultchecksum(aseq, ainfo.nseq) != hmm->checksum)
    ajFatal("The checksums for alignment file %s and the HMM alignment map don't match.", 
	seqfile);

  /* One master trace for the alignment, then one trace per sequence. */
  master = do_mapped ? MasterTraceFromMap(hmm->map, hmm->M, ainfo.alen)
		     : P7ViterbiAlignAlignment(aseq, &ainfo, hmm);
  ImposeMasterTrace(aseq, ainfo.nseq, master, &addtr);

  /* Append the new traces to the existing ones. */
  *tr = MergeTraceArrays(*tr, *nseq, addtr, ainfo.nseq);

  /* New entries occupy slots base .. total-1 of each array. */
  base  = *nseq;
  total = base + ainfo.nseq;

  /* Raw sequences: the pointers move over, only the holder is freed. */
  *rsq = ReallocOrDie((*rsq), sizeof(char *) * total);
  DealignAseqs(aseq, ainfo.nseq, &fresh_rseq);
  for (i = 0; i < ainfo.nseq; i++)
    (*rsq)[base + i] = fresh_rseq[i];
  free(fresh_rseq);

  /* Digitized sequences: same treatment. */
  *dsq = ReallocOrDie((*dsq), sizeof(char *) * total);
  DigitizeAlignment(aseq, &ainfo, &fresh_dsq);
  for (i = 0; i < ainfo.nseq; i++)
    (*dsq)[base + i] = fresh_dsq[i];
  free(fresh_dsq);

  /* Sequence info records are copied, since ainfo is freed below. */
  *sqinfo = ReallocOrDie((*sqinfo), sizeof(SQINFO) * total);
  for (i = 0; i < ainfo.nseq; i++)
    SeqinfoCopy(&((*sqinfo)[base + i]), &(ainfo.sqinfo[i]));

  *nseq = total;

  /* Cleanup */
  P7FreeTrace(master);
  FreeAlignment(aseq, &ainfo);
  return;
}