Пример #1
0
/* Function: P7PriorifyHMM()
 * 
 * Purpose:  Fold Dirichlet-prior pseudocounts into a counts-form HMM
 *           and then renormalize it.
 *
 *           The B->D1 / B->M1 pair gets a plain Laplace (+1) treatment,
 *           internal begin/end entries are zeroed, and each node's
 *           transition, match and insert vectors are passed through
 *           the prior.
 * 
 * Args:     hmm -- model holding counts; modified in place
 *           pri -- the Dirichlet prior to apply
 *           
 * Return:   (void)
 *           hmm comes back in probability form.
 */          
void
P7PriorifyHMM(struct plan7_s *hmm, struct p7prior_s *pri)
{
  float denom;			/* Laplace normalizer for B->D1 / B->M1 */
  int   node;			/* model position being processed       */

  /* Begin/end handling: no internal entries or exits, and a
   * plus-one prior on the two ways of leaving B.
   */
  FSet(hmm->begin + 2, hmm->M - 1, 0.);
  FSet(hmm->end + 1,   hmm->M - 1, 0.);

  denom            = hmm->tbd1 + hmm->begin[1] + 2.;
  hmm->tbd1        = (hmm->tbd1     + 1.) / denom;
  hmm->begin[1]    = (hmm->begin[1] + 1.) / denom;
  hmm->end[hmm->M] = 1.0;

  /* Nodes 1..M-1 carry transitions, match and insert emissions.
   */
  node = 1;
  while (node < hmm->M)
    {
      P7PriorifyTransitionVector(hmm->t[node], pri);
      P7PriorifyEmissionVector(hmm->mat[node], pri, pri->mnum, pri->mq, pri->m, NULL);
      P7PriorifyEmissionVector(hmm->ins[node], pri, pri->inum, pri->iq, pri->i, NULL);
      node++;
    }

  /* Node M only has match emissions.
   */
  P7PriorifyEmissionVector(hmm->mat[hmm->M], pri, pri->mnum, pri->mq, pri->m, NULL);

  Plan7Renormalize(hmm);
}
/* Function: default_nucleic_prior()
 * 
 * Purpose:  Build the default DNA prior: a single-component Dirichlet
 *           whose emission terms are all 1.0 (Laplace) and whose
 *           transition terms are fixed constants.
 *
 * Return:   the newly allocated prior (from P7AllocPrior()).
 */
static struct p7prior_s *
default_nucleic_prior(void)
{
  /* The transition terms are the Pfam-trained amino acid values.
   * Using them for DNA is TOTALLY bogus, but it behaves better than
   * a straight Laplace, esp. for Maxmodelmaker(): a Laplace prior
   * builds M=1 models for a single sequence GAATTC (at one time an
   * open "bug").
   */
  const struct { int which; float alpha; } trans[] = {
    { TMM, 0.7939 },
    { TMI, 0.0278 },
    { TMD, 0.0135 },
    { TIM, 0.1551 },
    { TII, 0.1331 },
    { TDM, 0.9002 },
    { TDD, 0.5630 },
  };
  const int ntrans = (int) (sizeof(trans) / sizeof(trans[0]));
  struct p7prior_s *prior;
  int n;

  prior           = P7AllocPrior();
  prior->strategy = PRI_DCHLET;

  /* One transition component with weight 1. */
  prior->tnum  = 1;
  prior->tq[0] = 1.;
  for (n = 0; n < ntrans; n++)
    prior->t[0][trans[n].which] = trans[n].alpha;

  /* One match-emission component, all alphas 1. */
  prior->mnum  = 1;
  prior->mq[0] = 1.;
  FSet(prior->m[0], Alphabet_size, 1.);

  /* One insert-emission component, all alphas 1. */
  prior->inum  = 1;
  prior->iq[0] = 1.;
  FSet(prior->i[0], Alphabet_size, 1.);

  return prior;
}
Пример #3
0
/* Function: StrMarkov0()
 * Date:     SRE, Fri Oct 29 11:08:31 1999 [St. Louis]
 *
 * Purpose:  Returns a random string s1 with the same
 *           length and zero-th order Markov properties
 *           as s2. Case is folded: the output is upper case.
 *           
 *           s1 and s2 may be identical, to randomize s2
 *           in place (s2 is read completely before s1 is written).
 *
 * Args:     s1 - allocated space for random string
 *                (at least strlen(s2)+1 bytes)
 *           s2 - string to base s1's properties on.
 *
 * Returns:  1 on success; 0 if s2 contains anything other than
 *           the letters A-Z/a-z (s1 is untouched in that case).
 */
int 
StrMarkov0(char *s1, char *s2)
{
  int   len;
  int   pos; 
  int   sym;			/* 0..25 index of a symbol      */
  float p[26];			/* symbol probabilities         */

  /* First, verify that the string is entirely alphabetic.
   * The <ctype.h> functions require an unsigned char value, so
   * plain char (possibly negative) must be cast first. The explicit
   * range check rejects locale-specific letters that would otherwise
   * index outside p[].
   */
  len = strlen(s2);
  for (pos = 0; pos < len; pos++)
    {
      unsigned char c = (unsigned char) s2[pos];

      if (! isalpha(c)) return 0;
      sym = toupper(c) - 'A';
      if (sym < 0 || sym >= 26) return 0;
    }

  /* Collect zeroth order counts and convert to frequencies.
   */
  FSet(p, 26, 0.);
  for (pos = 0; pos < len; pos++)
    {
      sym = toupper((unsigned char) s2[pos]) - 'A';
      p[sym] += 1.0;
    }
  FNorm(p, 26);

  /* Generate a random string using those p's.
   */
  for (pos = 0; pos < len; pos++)
    s1[pos] = FChoose(p, 26) + 'A';
  s1[pos] = '\0';

  return 1;
}
Пример #4
0
/* Function: Plan7FSConfig()
 * Date:     SRE, Fri Jan  2 15:34:40 1998 [StL]
 * 
 * Purpose:  Put a Plan7 model into hmmfs (multihit Smith/Waterman)
 *           configuration by setting its alignment-independent
 *           parameters: the special-state transitions, the begin
 *           (entry) distribution and the end (exit) distribution.
 *           
 *           See comments on Plan7SWConfig() for explanation of
 *           how pentry and pexit are used.
 *           
 * Args:     hmm    - the Plan7 model w/ data-dep prob's valid
 *           pentry - probability of an internal entry somewhere;
 *                    spread evenly over begin[2..M]
 *           pexit  - probability of an internal exit somewhere; 
 *                    distributed over end[1..M-1]
 *                    
 * Return:   (void)
 *           HMM probabilities are modified; the PLAN7_HASBITS flag
 *           is cleared.
 */
void
Plan7FSConfig(struct plan7_s *hmm, float pentry, float pexit)
{
  float ninternal;		/* M-1, as a float               */
  float exit_base;		/* p1 for exits: the base p      */
  int   node;			/* counter over states           */

  /* Special states: N, C and J loop with p1, E is a 50/50 choice
   * between looping (multihit) and moving on.
   */
  hmm->xt[XTN][LOOP] = hmm->p1;
  hmm->xt[XTN][MOVE] = 1 - hmm->p1;	/* allow N-terminal tail     */
  hmm->xt[XTC][LOOP] = hmm->p1;
  hmm->xt[XTC][MOVE] = 1 - hmm->p1;	/* allow C-terminal tail     */
  hmm->xt[XTJ][LOOP] = hmm->p1;
  hmm->xt[XTJ][MOVE] = 1. - hmm->p1;	/* allow J junction between domains */
  hmm->xt[XTE][LOOP] = 0.5;		/* allow loops / multihits   */
  hmm->xt[XTE][MOVE] = 0.5;

  ninternal = (float) (hmm->M - 1);

  /* Entry: M1 keeps (1-pentry) of the non-delete mass, the rest is
   * shared equally among the internal match states.
   */
  hmm->begin[1] = (1. - pentry) * (1. - hmm->tbd1);
  FSet(hmm->begin + 2, hmm->M - 1, (pentry * (1. - hmm->tbd1)) / ninternal);

  /* Exit: M_M always exits; internal exits are derived from the
   * base probability and then renormalized against the transitions.
   */
  hmm->end[hmm->M] = 1.0;
  exit_base = pexit / ninternal;
  node = 1;
  while (node < hmm->M)
    {
      hmm->end[node] = exit_base / (1. - exit_base * (float) (node - 1));
      node++;
    }
  Plan7RenormalizeExits(hmm);

  /* Reconfiguration invalidates any log-odds scores. */
  hmm->flags &= ~PLAN7_HASBITS;
}
Пример #5
0
/* Function: Plan7LSConfig()
 * 
 * Purpose:  Put a Plan7 model into hmmls configuration (global with
 *           respect to the HMM, local with respect to the sequence)
 *           by setting its alignment-independent parameters.
 *           
 * Args:     hmm  - the plan7 model
 *                 
 * Return:   (void);
 *           the HMM probabilities are modified and the
 *           PLAN7_HASBITS flag is cleared.
 */
void
Plan7LSConfig(struct plan7_s *hmm)
{
  /* Special states. N, C and J all loop with p1; E splits evenly
   * between ending and looping back (expectation 2 domains/seq).
   */
  hmm->xt[XTN][LOOP] = hmm->p1;
  hmm->xt[XTN][MOVE] = 1. - hmm->p1;	/* allow N-terminal tail  */
  hmm->xt[XTC][LOOP] = hmm->p1;
  hmm->xt[XTC][MOVE] = 1. - hmm->p1;	/* allow C-terminal tail  */
  hmm->xt[XTJ][LOOP] = hmm->p1;
  hmm->xt[XTJ][MOVE] = 1. - hmm->p1;	/* allow J junction state */
  hmm->xt[XTE][LOOP] = 0.5;
  hmm->xt[XTE][MOVE] = 0.5;

  /* Entry only at M1/D1: zero begin[2..M]. */
  hmm->begin[1] = 1. - hmm->tbd1;
  FSet(hmm->begin + 2, hmm->M - 1, 0.);

  /* Exit only at M_m/D_m: zero end[1..M-1]. */
  hmm->end[hmm->M] = 1.;
  FSet(hmm->end + 1, hmm->M - 1, 0.);
  Plan7RenormalizeExits(hmm);

  /* Reconfiguration invalidates any log-odds scores. */
  hmm->flags &= ~PLAN7_HASBITS;
}  
Пример #6
0
/* Function: Plan7GlobalConfig()
 * 
 * Purpose:  Put a Plan7 model into global (Needleman/Wunsch)
 *           configuration by setting its alignment-independent,
 *           algorithm-dependent parameters.
 * 
 *           Like a non-looping hmmls, since we actually allow flanking
 *           N and C terminal sequence. 
 *           
 * Args:     hmm - the plan7 model
 *                 
 * Return:   (void)
 *           The HMM is modified; algorithm dependent parameters are set.
 *           The PLAN7_HASBITS flag is cleared, so previous scores are
 *           invalidated if they existed.
 */
void
Plan7GlobalConfig(struct plan7_s *hmm)                           
{
  /* Flanking sequence is still allowed through N and C. */
  hmm->xt[XTN][LOOP] = hmm->p1;
  hmm->xt[XTN][MOVE] = 1. - hmm->p1;	/* allow N-terminal tail */
  hmm->xt[XTC][LOOP] = hmm->p1;
  hmm->xt[XTC][MOVE] = 1. - hmm->p1;	/* allow C-terminal tail */

  /* E never loops: only 1 domain/sequence ("global" alignment). */
  hmm->xt[XTE][LOOP] = 0.;
  hmm->xt[XTE][MOVE] = 1.;

  /* J state unused. */
  hmm->xt[XTJ][LOOP] = 1.;
  hmm->xt[XTJ][MOVE] = 0.;

  /* Disallow internal entries: zero begin[2..M]. */
  hmm->begin[1] = 1. - hmm->tbd1;
  FSet(hmm->begin + 2, hmm->M - 1, 0.);

  /* Disallow internal exits: zero end[1..M-1]. */
  hmm->end[hmm->M] = 1.;
  FSet(hmm->end + 1, hmm->M - 1, 0.);
  Plan7RenormalizeExits(hmm);

  /* Reconfiguration invalidates any log-odds scores. */
  hmm->flags &= ~PLAN7_HASBITS;
}
Пример #7
0
/* Function: Plan7ESTConfig()
 * 
 * Purpose:  Configure a Plan7 model for EST Smith/Waterman
 *           analysis: sets the special-state transitions, the
 *           begin/end distributions, and fills the per-codon score
 *           tables hmm->dnam / hmm->dnai plus the frameshift scores
 *           hmm->dna2 / hmm->dna4.
 *           
 *           OUTDATED; DO NOT USE WITHOUT RECHECKING
 *
 *           NOTE(review): tripnull is never assigned before it is
 *           indexed below, so calling this function as written reads
 *           through an uninitialised pointer.
 *           
 * Args:     hmm        - hmm to configure.
 *           aacode     - 0..63 vector mapping genetic code to amino acids
 *           estmodel   - 20x64 translation matrix, w/ codon bias and substitution error
 *           dna2       - probability of a -1 frameshift in a triplet
 *           dna4       - probability of a +1 frameshift in a triplet     
 *
 * Return:   (void)
 */ 
void
Plan7ESTConfig(struct plan7_s *hmm, int *aacode, float **estmodel, 
	       float dna2, float dna4)
{
  int k;			/* counter over model nodes 1..M */
  int x;			/* counter over the 64 codons    */
  float p;			/* probability being converted to a score */
  float *tripnull;		/* UNFINISHED!!! (never initialised; see header note) */

				/* configure specials */
  hmm->xt[XTN][MOVE] = 1./351.;
  hmm->xt[XTN][LOOP] = 350./351.;
  hmm->xt[XTE][MOVE] = 1.;
  hmm->xt[XTE][LOOP] = 0.;
  hmm->xt[XTC][MOVE] = 1./351.;
  hmm->xt[XTC][LOOP] = 350./351.;
  hmm->xt[XTJ][MOVE] = 1.;
  hmm->xt[XTJ][LOOP] = 0.;
				/* configure entry/exit */
  hmm->begin[1] = 0.5;
  FSet(hmm->begin+2, hmm->M-1, 0.5 / ((float)hmm->M - 1.));
  hmm->end[hmm->M] = 1.;
  /* NOTE(review): this fills end[0..M-2]; the other Plan7*Config()
   * routines start at hmm->end+1 (end[1..M-1]). Possibly an
   * off-by-one -- confirm before reviving this function.
   */
  FSet(hmm->end, hmm->M-1, 0.5 / ((float)hmm->M - 1.));

				/* configure dna triplet/frameshift emissions */
  for (k = 1; k <= hmm->M; k++)
    {
				/* translate aa to triplet probabilities */
      for (x = 0; x < 64; x++) {
	p =  hmm->mat[k][aacode[x]] * estmodel[aacode[x]][x] * (1.-dna2-dna4);
	hmm->dnam[x][k] = Prob2Score(p, tripnull[x]);

	p = hmm->ins[k][aacode[x]] * estmodel[aacode[x]][x] * (1.-dna2-dna4);
	hmm->dnai[x][k] = Prob2Score(p, tripnull[x]);
      }
      /* Only the match table gets an entry for index 64; dnai[64][k]
       * is not written here.
       */
      hmm->dnam[64][k] = 0;	/* ambiguous codons score 0 (danger?) */
      /* These two do not depend on k; they are simply reassigned on
       * every iteration.
       */
      hmm->dna2 = Prob2Score(dna2, 1.);
      hmm->dna4 = Prob2Score(dna4, 1.);
    }
}
/* Function: P7LaplacePrior()
 * 
 * Purpose:  Create a Laplace plus-one prior. (single component Dirichlets). 
 *           Global alphabet info is assumed to have been set already
 *           (Alphabet_size is used to size the emission vectors).
 *
 * Args:     (void)
 *
 * Return:   prior. Allocated here via P7AllocPrior(); the caller is
 *           responsible for freeing it (documented upstream as
 *           FreePrior() -- confirm the current name of that routine).
 */ 
struct p7prior_s *
P7LaplacePrior(void)
{
  struct p7prior_s *pri;
  
  pri = P7AllocPrior();
  pri->strategy = PRI_DCHLET;

  /* Transitions: one component, weight 1, all alphas 1.
   * A Plan7 transition vector holds 7 terms (TMM, TMI, TMD, TIM, TII,
   * TDM, TDD). Filling 8 wrote one element past the end of row 0.
   */
  pri->tnum     = 1;
  pri->tq[0]    = 1.;
  FSet(pri->t[0], 7, 1.); 
  
  /* Match emissions: one component, weight 1, all alphas 1. */
  pri->mnum  = 1;
  pri->mq[0] = 1.;
  FSet(pri->m[0], Alphabet_size, 1.);

  /* Insert emissions: one component, weight 1, all alphas 1. */
  pri->inum  = 1;
  pri->iq[0] = 1.;
  FSet(pri->i[0], Alphabet_size, 1.);

  return pri;
}
Пример #9
0
/* Function: StrMarkov1()
 * Date:     SRE, Fri Oct 29 11:22:20 1999 [St. Louis]
 *
 * Purpose:  Returns a random string s1 with the same
 *           length and first order Markov properties
 *           as s2. The first symbol of s1 is the (upper-cased)
 *           first symbol of s2; output is upper case.
 *           
 *           s1 and s2 may be identical, to randomize s2
 *           in place (s2 is read completely before s1 is written).
 *
 * Args:     s1 - allocated space for random string
 *                (at least strlen(s2)+1 bytes)
 *           s2 - string to base s1's properties on.
 *
 * Returns:  1 on success; 0 if s2 contains anything other than
 *           the letters A-Z/a-z (s1 is untouched in that case).
 *           An empty s2 succeeds and yields an empty s1.
 */
int 
StrMarkov1(char *s1, char *s2)
{
  int   len;
  int   pos; 
  int   x,y;
  int   i;			/* initial symbol */
  float p[26][26];		/* symbol probabilities */

  /* First, verify that the string is entirely alphabetic.
   * The <ctype.h> functions require an unsigned char value, so
   * plain char (possibly negative) must be cast first. The explicit
   * range check rejects locale-specific letters that would otherwise
   * index outside p[][].
   */
  len = strlen(s2);
  for (pos = 0; pos < len; pos++)
    {
      unsigned char c = (unsigned char) s2[pos];

      if (! isalpha(c)) return 0;
      x = toupper(c) - 'A';
      if (x < 0 || x >= 26) return 0;
    }

  /* Nothing to model for an empty string. Without this guard the
   * code below would derive a start symbol from the NUL terminator
   * and write s1[1], one byte beyond a minimally sized buffer.
   */
  if (len == 0)
    {
      s1[0] = '\0';
      return 1;
    }

  /* Collect first order counts and convert to frequencies.
   */
  for (x = 0; x < 26; x++) FSet(p[x], 26, 0.);

  i = x = toupper((unsigned char) s2[0]) - 'A';
  for (pos = 1; pos < len; pos++)
    {
      y = toupper((unsigned char) s2[pos]) - 'A';
      p[x][y] += 1.0; 
      x = y;
    }
  for (x = 0; x < 26; x++) 
    FNorm(p[x], 26);

  /* Generate a random string using those p's.
   */
  x = i;
  s1[0] = x + 'A';
  for (pos = 1; pos < len; pos++)
    {
      y = FChoose(p[x], 26);
      s1[pos] = y + 'A';
      x = y;
    } 
  s1[pos] = '\0';

  return 1;
}
Пример #10
0
/* Function: ZeroPlan7()
 * 
 * Purpose:  Reset every counts/probabilities field of a model to zero:
 *           node transitions, match and insert emissions, tbd1,
 *           begin[1..M], end[1..M] and the four special-state
 *           transition pairs. The null model is not touched.
 *
 *           Both PLAN7_HASBITS and PLAN7_HASPROB are cleared, since
 *           neither scores nor probabilities are valid afterwards.
 */
void
ZeroPlan7(struct plan7_s *hmm)
{
  int node;			/* model position        */
  int special;			/* index into hmm->xt    */

  /* Nodes 1..M-1 have transitions and insert emissions. */
  node = 1;
  while (node < hmm->M)
    {
      FSet(hmm->t[node],   7,             0.);
      FSet(hmm->mat[node], Alphabet_size, 0.);
      FSet(hmm->ins[node], Alphabet_size, 0.);
      node++;
    }
  /* Node M has match emissions only. */
  FSet(hmm->mat[hmm->M], Alphabet_size, 0.);

  hmm->tbd1 = 0.;
  FSet(hmm->begin + 1, hmm->M, 0.);
  FSet(hmm->end + 1,   hmm->M, 0.);

  special = 0;
  while (special < 4)
    {
      FSet(hmm->xt[special], 2, 0.);
      special++;
    }

  /* Invalidate scores and probabilities in one step. */
  hmm->flags &= ~(PLAN7_HASBITS | PLAN7_HASPROB);
}
/* Function: P7PriorifyHMM()
 * 
 * Purpose:  Fold Dirichlet-prior pseudocounts into a counts-form HMM
 *           and then renormalize it.
 *
 *           The B->D1 / B->M1 pair gets a plain Laplace (+1) treatment
 *           and internal begin/end entries are zeroed. Every other
 *           vector is passed through the mixture prior, using either
 *           the prior's own mixture coefficients or, where per-column
 *           X-PR* annotation is present, a single forced component.
 * 
 * Args:     hmm -- model holding counts; modified in place
 *           pri -- the Dirichlet prior to apply
 *           
 * Return:   (void)
 *           hmm comes back in probability form.
 *           Die()s if an X-PR* annotation names a component the
 *           prior does not have.
 */          
void
P7PriorifyHMM(struct plan7_s *hmm, struct p7prior_s *pri)
{
  float tq[MAXDCHLET];		/* mixture coefficients in effect: transitions */
  float mq[MAXDCHLET];		/* mixture coefficients in effect: match       */
  float iq[MAXDCHLET];		/* mixture coefficients in effect: insert      */
  float denom;			/* Laplace normalizer for B->D1 / B->M1        */
  int   node;			/* model position being processed              */
  int   comp;			/* component selected by an annotation         */

  /* Begin/end handling: no internal entries or exits, and a
   * plus-one prior on the two ways of leaving B.
   */
  FSet(hmm->begin + 2, hmm->M - 1, 0.);
  FSet(hmm->end + 1,   hmm->M - 1, 0.);

  denom            = hmm->tbd1 + hmm->begin[1] + 2.;
  hmm->tbd1        = (hmm->tbd1     + 1.) / denom;
  hmm->begin[1]    = (hmm->begin[1] + 1.) / denom;
  hmm->end[hmm->M] = 1.0;

  /* Nodes 1..M-1: transitions, match and insert emissions.
   *
   * The component-selection code is experimental (collaboration with
   * Michael Asman, Erik Sonnhammer, CGR Stockholm). Without X-PR*
   * annotation -- the usual case -- the prior's real mixture
   * coefficients are simply copied into tq/mq/iq. With annotation,
   * the named component gets weight 1.0 and all others 0.0.
   */
  for (node = 1; node < hmm->M; node++)
    {
      if (hmm->tpri == NULL || hmm->tpri[node] < 0)
	FCopy(tq, pri->tq, pri->tnum);
      else
	{
	  comp = hmm->tpri[node];
	  if (comp >= pri->tnum) Die("X-PRT annotation out of range");
	  FSet(tq, pri->tnum, 0.0);
	  tq[comp] = 1.0;
	}

      if (hmm->mpri == NULL || hmm->mpri[node] < 0)
	FCopy(mq, pri->mq, pri->mnum);
      else
	{
	  comp = hmm->mpri[node];
	  if (comp >= pri->mnum) Die("X-PRM annotation out of range");
	  FSet(mq, pri->mnum, 0.0);
	  mq[comp] = 1.0;
	}

      if (hmm->ipri == NULL || hmm->ipri[node] < 0)
	FCopy(iq, pri->iq, pri->inum);
      else
	{
	  comp = hmm->ipri[node];
	  if (comp >= pri->inum) Die("X-PRI annotation out of range");
	  FSet(iq, pri->inum, 0.0);
	  iq[comp] = 1.0;
	}

      /* The actual work for this node. */
      P7PriorifyTransitionVector(hmm->t[node], pri, tq);
      P7PriorifyEmissionVector(hmm->mat[node], pri, pri->mnum, mq, pri->m, NULL);
      P7PriorifyEmissionVector(hmm->ins[node], pri, pri->inum, iq, pri->i, NULL);
    }

  /* Node M has match emissions only; same selection logic.
   */
  if (hmm->mpri == NULL || hmm->mpri[hmm->M] < 0)
    FCopy(mq, pri->mq, pri->mnum);
  else
    {
      comp = hmm->mpri[hmm->M];
      if (comp >= pri->mnum) Die("X-PRM annotation out of range");
      FSet(mq, pri->mnum, 0.0);
      mq[comp] = 1.0;
    }
  P7PriorifyEmissionVector(hmm->mat[hmm->M], pri, pri->mnum, mq, pri->m, NULL);

  /* Counts plus pseudocounts are in place; convert to probabilities.
   */
  Plan7Renormalize(hmm);
}
Пример #12
0
/* Function: main()
 *
 * Purpose:  Entry point of the "ohmmemit" program. Reads one HMM from
 *           the input file, configures it with Plan7NakedConfig(), and
 *           writes to the output file one of:
 *             - a single consensus sequence        (consensus option),
 *             - nseq sampled sequences as a SELEX alignment (selex option),
 *             - nseq sampled unaligned sequences   (default).
 *
 * Args:     argc, argv - command line, handed to embInitPV().
 *
 * Return:   0. Fatal conditions are reported through ajFatal().
 */
int main(int argc, char **argv) 
{
    const char      *hmmfile;	/* file to read HMMs from                  */
    FILE            *fp;	/* output file handle                      */
    HMMFILE         *hmmfp;	/* opened hmmfile for reading              */
    struct plan7_s  *hmm;	/* HMM to generate from                    */
    int              L;		/* length of a sequence                    */
    int              i;		/* counter over sequences                  */

    char            *ofile;	/* output sequence file                    */
    int              nseq;	/* number of seqs to sample                */
    int              seed;	/* random number generator seed            */
    int              be_quiet;	/* TRUE to silence header/footer           */
    int              do_alignment; /* TRUE to output in aligned format     */ 
    int              do_consensus; /* TRUE to do a single consensus seq    */

    AjBool ajselex;		/* value of the "selex" option             */
    AjBool ajcons;		/* value of the "consensus" option         */
    AjPFile inf=NULL;		/* input (HMM) file object                 */
    AjPFile outf=NULL;		/* output file object                      */
    AjPStr  instr=NULL;		/* name of the input file                  */
    AjPStr  outstr=NULL;	/* name of the output file (only created and deleted here) */
  

#ifdef MEMDEBUG
    unsigned long histid1, histid2, orig_size, current_size;
    orig_size = malloc_inuse(&histid1);
    fprintf(stderr, "[... memory debugging is ON ...]\n");
#endif

    /*********************************************** 
     * Parse command line
     ***********************************************/

    nseq         = 10;

    be_quiet     = FALSE;
    do_alignment = FALSE;  
    do_consensus = FALSE;
    ofile        = NULL;	/* never reassigned in this function */

    embInitPV("ohmmemit",argc,argv,"HMMER",VERSION);

    /* Fetch the option values by name. */
    ajselex = ajAcdGetBoolean("selex");
    ajcons  = ajAcdGetBoolean("consensus");
    nseq    = ajAcdGetInt("number");
    seed    = ajAcdGetInt("seed");
    inf     = ajAcdGetInfile("infile");
    outf    = ajAcdGetOutfile("outfile");
  
    /* A seed of 0 means "use the current time". */
    if(!seed)
	seed = time ((time_t *) NULL);

    if(ajselex)
	do_alignment=TRUE;
    else
	do_alignment=FALSE;
  
    if(ajcons)
	do_consensus=TRUE;
    else
	do_consensus=FALSE;

    instr  = ajStrNewC((char *)ajFileGetNameC(inf));
    outstr = ajStrNewC((char *)ajFileGetNameC(outf));

    /* hmmfile points into instr, so instr must stay alive while
     * hmmfile is in use (it is deleted only at the very end).
     */
    hmmfile = ajStrGetPtr(instr);

    sre_srandom(seed);

    if (do_alignment && do_consensus)
	ajFatal("Sorry, -selex and -consensus are incompatible.\n"); 
    /* The warning compares against the built-in default of 10, so an
     * explicit "number" of 10 does not trigger it.
     */
    if (nseq != 10 && do_consensus)
	ajWarn("-consensus overrides -number (# of sampled seqs)");

    /*********************************************** 
     * Open HMM file (might be in HMMERDB or current directory).
     * Read a single HMM from it.
     ***********************************************/

    if ((hmmfp = HMMFileOpen(hmmfile, "HMMERDB")) == NULL)
	ajFatal("Failed to open HMM file %s\n", hmmfile);
    if (!HMMFileRead(hmmfp, &hmm)) 
	ajFatal("Failed to read any HMMs from %s\n", hmmfile);
    HMMFileClose(hmmfp);
    if (hmm == NULL) 
	ajFatal("HMM file %s corrupt or in incorrect format? Parse failed",
		hmmfile);

    /* Configure the HMM to shut off N,J,C emission: so we
     * do a simple single pass through the model.
     */
    Plan7NakedConfig(hmm);
    Plan7Renormalize(hmm);

    /*********************************************** 
     * Open the output file, or stdout
     ***********************************************/ 

    /* The stdio handle is taken from outf, so fp is only usable
     * until outf is closed.
     */
    fp = ajFileGetFileptr(outf);
  
 
    /*********************************************** 
     * Show the options banner
     ***********************************************/
    /* be_quiet is forced to TRUE here, so the banner below (and the
     * "Alignment saved" message further down) can never be printed.
     */
    be_quiet=TRUE;
    if (! be_quiet) 
    {
	printf("HMM file:             %s\n", hmmfile);
	if (! do_consensus)
	{
	    printf("Number of seqs:       %d\n", nseq);
	    printf("Random seed:          %d\n", seed);
	}
	printf("- - - - - - - - - - - - - - - - - - - - - - - - - "
	       "- - - - - - -\n\n");
    }

    /*********************************************** 
     * Do the work.
     * If we're generating an alignment, we have to collect
     * all our traces, then output. If we're generating unaligned
     * sequences, we can emit one at a time.
     ***********************************************/

    if (do_consensus) 
    {
	char    *seq;
	SQINFO   sqinfo;	/* info about sequence (name/desc)        */

	EmitConsensusSequence(hmm, &seq, NULL, &L, NULL);
	strcpy(sqinfo.name, "consensus");
	sqinfo.len = L;
	sqinfo.flags = SQINFO_NAME | SQINFO_LEN;

	WriteSeq(fp, kPearson, seq, &sqinfo);
	free(seq);
    }
    else if (do_alignment)
    {
	struct p7trace_s **tr;	/* one trace per sampled sequence         */
	char           **dsq;	/* sampled sequences, as returned by EmitSequence() */
	SQINFO          *sqinfo;	/* name/length info per sequence  */
	char           **aseq;	/* aligned sequences built from the traces */
	AINFO            ainfo;	/* alignment info filled by P7Traces2Alignment() */
	float           *wgt;	/* per-sequence weights, all 1.0          */

	/* NOTE(review): nseq comes straight from the "number" option;
	 * this code does not itself reject zero or negative values.
	 */
	dsq    = MallocOrDie(sizeof(char *)             * nseq);
	tr     = MallocOrDie(sizeof(struct p7trace_s *) * nseq);
	sqinfo = MallocOrDie(sizeof(SQINFO)             * nseq);
	wgt    = MallocOrDie(sizeof(float)              * nseq);
	FSet(wgt, nseq, 1.0);

	for (i = 0; i < nseq; i++)
	{
	    EmitSequence(hmm, &(dsq[i]), &L, &(tr[i]));
	    sprintf(sqinfo[i].name, "seq%d", i+1);
	    sqinfo[i].len   = L;
	    sqinfo[i].flags = SQINFO_NAME | SQINFO_LEN;
	}

	P7Traces2Alignment(dsq, sqinfo, wgt, nseq, hmm->M, tr, FALSE, 
			   &aseq, &ainfo);

	/* Output the alignment */
	WriteSELEX(fp, aseq, &ainfo, 50);
	/* Dead code as written: ofile is always NULL and be_quiet is
	 * always TRUE at this point.
	 */
	if (ofile != NULL && !be_quiet)
	    printf("Alignment saved in file %s\n", ofile);

	/* Free memory
	 */
	for (i = 0; i < nseq; i++) 
	{
	    P7FreeTrace(tr[i]);
	    free(dsq[i]);
	}
	FreeAlignment(aseq, &ainfo);
	free(sqinfo);
	free(dsq);
	free(wgt);
	free(tr);
    }
    else				/* unaligned sequence output */
    {
	struct p7trace_s *tr;	/* trace of the current sample            */
	char             *dsq;	/* current sample, as returned by EmitSequence() */
	char             *seq;	/* current sample after DedigitizeSequence() */
	SQINFO            sqinfo;	/* name/length info for the sample */

	/* Emit, convert, write and free one sequence at a time. */
	for (i = 0; i < nseq; i++)
	{
	    EmitSequence(hmm, &dsq, &L, &tr);
	    sprintf(sqinfo.name, "seq%d", i+1);
	    sqinfo.len   = L;
	    sqinfo.flags = SQINFO_NAME | SQINFO_LEN;

	    seq = DedigitizeSequence(dsq, L);

	    WriteSeq(fp, kPearson, seq, &sqinfo);
	  
	    P7FreeTrace(tr);
	    free(dsq);
	    free(seq);
	}
    }

    /* First close of outf; fp must not be used after this. */
    ajFileClose(&outf);
  
    FreePlan7(hmm);
    SqdClean();

#ifdef MEMDEBUG
    current_size = malloc_inuse(&histid2);
    if (current_size != orig_size)
	malloc_list(2, histid1, histid2);
    else
	fprintf(stderr, "[No memory leaks.]\n");
#endif


    ajStrDel(&instr);
    ajStrDel(&outstr);
    ajFileClose(&inf);
    /* NOTE(review): outf was already closed above; this second call
     * relies on ajFileClose() tolerating an already-closed handle --
     * confirm, or drop one of the two calls.
     */
    ajFileClose(&outf);

    embExit();
    return 0;
}
Пример #13
0
/* Function: MSAVerifyParse()
 * Date:     SRE, Sat Jun  5 14:24:24 1999 [Madison, 1999 worm mtg]
 *
 * Purpose:  Last function called after a multiple alignment is
 *           parsed. Checks that parse was successful; makes sure
 *           required information is present; makes sure required
 *           information is consistent. Fields that are only used
 *           during parsing (sqlen, sslen, salen) are freed and set
 *           to NULL.
 *           
 *           Some fields in msa are modified: msa->alen is set from
 *           the length of the first sequence, and msa->wgt is filled
 *           with 1.0 when no weights were parsed.
 *
 * Args:     msa - the multiple alignment
 *                 sqname, aseq, sqlen must be set
 *                 nseq must be correct
 *                 alen need not be set; will be set here.
 *                 wgt will be set here if not already set
 *
 * Returns:  (void)
 *           Will Die() here with diagnostics on error.
 */
void
MSAVerifyParse(MSA *msa)
{
  int         idx;		/* counter over sequences                     */
  int         len;		/* length of a per-column annotation string   */
  const char *aliname;		/* alignment name for diagnostics, never NULL */

  aliname = (msa->name != NULL) ? msa->name : "";

  if (msa->nseq == 0) Die("Parse error: no sequences were found for alignment %s",
			  aliname);

  msa->alen = msa->sqlen[0];

  /* We can rely on msa->sqname[] being valid for any index,
   * because of the way the line parsers always store any name
   * they add to the index.
   */
  for (idx = 0; idx < msa->nseq; idx++)
    {
				/* aseq is required. */
      if (msa->aseq[idx] == NULL) 
	Die("Parse error: No sequence for %s in alignment %s", msa->sqname[idx],
	    aliname);
				/* either all weights must be set, or none of them */
      if ((msa->flags & MSA_SET_WGT) && msa->wgt[idx] == -1.0)
	Die("Parse error: some weights are set, but %s doesn't have one in alignment %s", 
	    msa->sqname[idx],
	    aliname);
				/* all aseq must be same length. */
      if (msa->sqlen[idx] != msa->alen)
	Die("Parse error: sequence %s: length %d, expected %d in alignment %s",
	    msa->sqname[idx], msa->sqlen[idx], msa->alen,
	    aliname);
				/* if SS is present, must have length right */
      if (msa->ss != NULL && msa->ss[idx] != NULL && msa->sslen[idx] != msa->alen) 
	Die("Parse error: #=GR SS annotation for %s: length %d, expected %d in alignment %s",
	    msa->sqname[idx], msa->sslen[idx], msa->alen,
	    aliname);
				/* if SA is present, must have length right */
      if (msa->sa != NULL && msa->sa[idx] != NULL && msa->salen[idx] != msa->alen) 
	Die("Parse error: #=GR SA annotation for %s: length %d, expected %d in alignment %s",
	    msa->sqname[idx], msa->salen[idx], msa->alen,
	    aliname);
    }

  /* Per-column annotation lines. strlen() returns size_t; it is
   * converted to int once so that the comparison with alen is
   * signed-to-signed and the value matches the %d conversions
   * in the diagnostics.
   */
				/* if cons SS is present, must have length right */
  if (msa->ss_cons != NULL && (len = (int) strlen(msa->ss_cons)) != msa->alen) 
    Die("Parse error: #=GC SS_cons annotation: length %d, expected %d in alignment %s",
	len, msa->alen,
	aliname);

				/* if cons SA is present, must have length right */
  if (msa->sa_cons != NULL && (len = (int) strlen(msa->sa_cons)) != msa->alen) 
    Die("Parse error: #=GC SA_cons annotation: length %d, expected %d in alignment %s",
	len, msa->alen,
	aliname);

				/* if RF is present, must have length right */
  if (msa->rf != NULL && (len = (int) strlen(msa->rf)) != msa->alen) 
    Die("Parse error: #=GC RF annotation: length %d, expected %d in alignment %s",
	len, msa->alen,
	aliname);

				/* No weights were parsed: use default weights */
  if (!(msa->flags & MSA_SET_WGT))
    FSet(msa->wgt, msa->nseq, 1.0);

				/* Clean up a little from the parser */
  if (msa->sqlen != NULL) { free(msa->sqlen); msa->sqlen = NULL; }
  if (msa->sslen != NULL) { free(msa->sslen); msa->sslen = NULL; }
  if (msa->salen != NULL) { free(msa->salen); msa->salen = NULL; }

  return;
}
Пример #14
0
/* Function: ReadSELEX()
 * Date:     SRE, Sun Jun  6 18:24:09 1999 [St. Louis]
 *
 * Purpose:  Parse an alignment read from an open SELEX format
 *           alignment file. (SELEX is a single alignment format).
 *           Return the alignment, or NULL if we've already read the
 *           alignment or there's no alignment data in the file.
 *           
 * Limitations: SELEX is the only remaining multipass parser for
 *           alignment files. It cannot read from gzip or from stdin.
 *           It Die()'s here if you try. The reason for this
 *           that SELEX allows space characters as gaps, so we don't
 *           know the borders of an alignment block until we've seen
 *           the whole block. I could rewrite to allow single-pass
 *           parsing (by storing the whole block in memory) but
 *           since SELEX is now legacy, why bother.
 *           
 *           Note that the interface is totally kludged: fastest
 *           possible adaptation of old ReadSELEX() to the new
 *           MSA interface.  
 *
 * Args:     afp  - open alignment file
 *
 * Returns:  MSA *  - an alignment object
 *                    caller responsible for an MSAFree()
 *           NULL if no alignment data.          
 */
MSA *
ReadSELEX(MSAFILE *afp)
{
  MSA     *msa;                 /* RETURN: mult seq alignment   */
  FILE    *fp;                  /* ptr to opened seqfile        */
  char   **aseqs;               /* aligned seqs                 */
  int      num = 0;		/* number of seqs read          */
  char     buffer[LINEBUFLEN];	/* input buffer for lines       */
  char     bufcpy[LINEBUFLEN];	/* strtok'able copy of buffer   */
  struct block_struc {          /** alignment data for a block: */
    int lcol;			/* furthest left aligned sym    */
    int rcol;			/* furthest right aligned sym   */
  } *blocks = NULL;
  int      blocknum;		/* number of blocks in file     */
  char    *nptr;                /* ptr to start of name on line */
  char    *sptr;                /* ptr into sequence on line    */
  int      currnum;		/* num. seqs in given block     */
  int      currblock;		/* index for blocks             */
  int      i;			/* loop counter                 */
  int      seqidx;		/* counter for seqs             */
  int      alen;                /* length of alignment          */
  int      warn_names;          /* becomes TRUE if names don't match between blocks */
  int      headnum;		/* seqidx in per-sequence header info */
  int      currlen;
  int      count;
  int      have_cs = 0;
  int      have_rf = 0;
  AINFO    base_ainfo, *ainfo;	/* hack: used to be passed ptr to AINFO */


  /* Convert from MSA interface to what old ReadSELEX() did:
   *     - copy our open fp, rather than opening file
   *     - verify that we're not reading a gzip or stdin
   */
  if (feof(afp->f)) return NULL;
  if (afp->do_gzip || afp->do_stdin)
    Die("Can't read a SELEX format alignment from a pipe, stdin, or gzip'ed file"); 
  fp    = afp->f;
  ainfo = &base_ainfo;

  /***************************************************
   * First pass across file. 
   * Count seqs, get names, determine column info
   * Determine what sorts of info are active in this file.
   ***************************************************/

  InitAinfo(ainfo);
				/* get first line of the block 
				 * (non-comment, non-blank) */
  do
    {
      if (fgets(buffer, LINEBUFLEN, fp) == NULL)
	{ squid_errno = SQERR_NODATA; return 0; }
      strcpy(bufcpy, buffer);
      if (*buffer == '#')
	{
	  if      (strncmp(buffer, "#=CS",    4) == 0) have_cs = 1;
	  else if (strncmp(buffer, "#=RF",    4) == 0) have_rf = 1;
	}
    }
  while ((nptr = strtok(bufcpy, WHITESPACE)) == NULL || 
	 (strchr(commentsyms, *nptr) != NULL));

  blocknum   = 0;
  warn_names = FALSE;
  while (!feof(fp))
    {
				/* allocate for info about this block. */
      if (blocknum == 0)
	blocks = (struct block_struc *) MallocOrDie (sizeof(struct block_struc));
      else 
	blocks = (struct block_struc *) ReallocOrDie (blocks, (blocknum+1) * sizeof(struct block_struc));
      blocks[blocknum].lcol = LINEBUFLEN+1;
      blocks[blocknum].rcol = -1;
	
      currnum = 0;
      while (nptr != NULL)	/* becomes NULL when this block ends. */
      {
				/* First block only: save names */
	if (blocknum == 0)
	  {
	    if (currnum == 0)
	      ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO));
	    else 
	      ainfo->sqinfo = (SQINFO *) ReallocOrDie (ainfo->sqinfo, (currnum + 1) * sizeof(SQINFO));

	    ainfo->sqinfo[currnum].flags = 0;
	    SetSeqinfoString(&(ainfo->sqinfo[currnum]), nptr, SQINFO_NAME);
	  }
	else			/* in each additional block: check names */
	  {
	    if (strcmp(ainfo->sqinfo[currnum].name, nptr) != 0)
	      warn_names = TRUE;
	  }
	currnum++;

				/* check rcol, lcol */
	if ((sptr = strtok(NULL, WHITESPACE)) != NULL)
	  {
				/* is this the furthest left we've
				   seen word 2 in this block? */
	    if (sptr - bufcpy < blocks[blocknum].lcol) 
	      blocks[blocknum].lcol = sptr - bufcpy;
				/* look for right side in buffer */
	    for (sptr = buffer + strlen(buffer) - 1;  
		 strchr(WHITESPACE, *sptr) != NULL;
		 sptr --)
	      /* do nothing */ ;
	    if (sptr - buffer > blocks[blocknum].rcol)
	      blocks[blocknum].rcol = sptr - buffer;
	  }

				/* get the next line; blank line means end of block */
	do
	  {
	    if (fgets(buffer, LINEBUFLEN, fp) == NULL) 
	      { nptr = NULL; break; }
	    strcpy(bufcpy, buffer);

	    if      (strncmp(buffer, "#=SS",    4) == 0) ainfo->sqinfo[currnum-1].flags |= SQINFO_SS;
	    else if (strncmp(buffer, "#=SA",    4) == 0) ainfo->sqinfo[currnum-1].flags |= SQINFO_SA;
	    else if (strncmp(buffer, "#=CS",    4) == 0) have_cs = 1;
	    else if (strncmp(buffer, "#=RF",    4) == 0) have_rf = 1;

	    if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) 
	      break;
	  } while (strchr(commentsyms, *nptr) != NULL);
      }


				/* check that number of sequences matches expected */
      if (blocknum == 0)
	num = currnum;
      else if (currnum != num)
	Die("Parse error in ReadSELEX()");
      blocknum++;

				/* get first line of next block 
				 * (non-comment, non-blank) */
      do
	{
	  if (fgets(buffer, LINEBUFLEN, fp) == NULL) { nptr = NULL; break; }
	  strcpy(bufcpy, buffer);
	}
      while ((nptr = strtok(bufcpy, WHITESPACE)) == NULL || 
	     (strchr(commentsyms, *nptr) != NULL));
    }

  
  /***************************************************
   * Get ready for second pass:
   *   figure out the length of the alignment
   *   malloc space
   *   rewind the file
   ***************************************************/

  alen = 0;
  for (currblock = 0; currblock < blocknum; currblock++)
    alen += blocks[currblock].rcol - blocks[currblock].lcol + 1;

  rewind(fp);

  /* allocations. we can't use AllocateAlignment because of
   * the way we already used ainfo->sqinfo.
   */
  aseqs     = (char **) MallocOrDie (num * sizeof(char *));
  if (have_cs) 
    ainfo->cs = (char *) MallocOrDie ((alen+1) * sizeof(char));
  if (have_rf) 
    ainfo->rf = (char *) MallocOrDie ((alen+1) * sizeof(char));

  
  
  for (i = 0; i < num; i++)
    {
      aseqs[i]     = (char *) MallocOrDie ((alen+1) * sizeof(char));
      if (ainfo->sqinfo[i].flags & SQINFO_SS)
	ainfo->sqinfo[i].ss = (char *) MallocOrDie ((alen+1) * sizeof(char));
      if (ainfo->sqinfo[i].flags & SQINFO_SA)
	ainfo->sqinfo[i].sa = (char *) MallocOrDie ((alen+1) * sizeof(char));
    }
  
  ainfo->alen = alen;
  ainfo->nseq = num; 
  ainfo->wgt  = (float *) MallocOrDie (sizeof(float) * num);
  FSet(ainfo->wgt, num, 1.0);

  /***************************************************
   * Second pass across file. Parse header; assemble sequences
   ***************************************************/
  /* We've now made a complete first pass over the file. We know how
   * many blocks it contains, we know the number of seqs in the first
   * block, and we know every block has the same number of seqs;
   * so we can be a bit more cavalier about error-checking as we
   * make the second pass.
   */

  /* Look for header
   */
  headnum = 0;
  for (;;)
    {
      if (fgets(buffer, LINEBUFLEN, fp) == NULL)
	Die("Parse error in ReadSELEX()");
      strcpy(bufcpy, buffer);
      if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) continue; /* skip blank lines */

      if (strcmp(nptr, "#=AU") == 0  && (sptr = strtok(NULL, "\n")) != NULL)
	ainfo->au = Strdup(sptr);
      else if (strcmp(nptr, "#=ID") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
	ainfo->name = Strdup(sptr);
      else if (strcmp(nptr, "#=AC") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
	ainfo->acc  = Strdup(sptr);
      else if (strcmp(nptr, "#=DE") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
	ainfo->desc = Strdup(sptr);
      else if (strcmp(nptr, "#=GA") == 0)
	{
	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 
	    Die("Parse error in #=GA line in ReadSELEX()");
	  ainfo->ga1 = atof(sptr);

	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 
	    Die("Parse error in #=GA line in ReadSELEX()");
	  ainfo->ga2 = atof(sptr);

	  ainfo->flags |= AINFO_GA;
	}
      else if (strcmp(nptr, "#=TC") == 0)
	{
	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 
	    Die("Parse error in #=TC line in ReadSELEX()");
	  ainfo->tc1 = atof(sptr);

	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 
	    Die("Parse error in #=TC line in ReadSELEX()");
	  ainfo->tc2 = atof(sptr);

	  ainfo->flags |= AINFO_TC;
	}
      else if (strcmp(nptr, "#=NC") == 0)
	{
	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 
	    Die("Parse error in #=NC line in ReadSELEX()");
	  ainfo->nc1 = atof(sptr);

	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 
	    Die("Parse error in #=NC line in ReadSELEX()");
	  ainfo->nc2 = atof(sptr);

	  ainfo->flags |= AINFO_NC;
	}
      else if (strcmp(nptr, "#=SQ") == 0)      /* per-sequence header info */
	{
				/* first field is the name */
	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
	    Die("Parse error in #=SQ line in ReadSELEX()");
	  if (strcmp(sptr, ainfo->sqinfo[headnum].name) != 0) warn_names = TRUE;

				/* second field is the weight */
	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
	    Die("Parse error in #=SQ line in ReadSELEX()");
	  if (!IsReal(sptr)) 
	    Die("Parse error in #=SQ line in ReadSELEX(): weight is not a number");
	  ainfo->wgt[headnum] = atof(sptr);

				/* third field is database source id */
	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
	    Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
	  SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_ID);

				/* fourth field is database accession number */
	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
	    Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
	  SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_ACC);

				/* fifth field is start..stop::olen */
	  if ((sptr = strtok(NULL, ".:")) == NULL)
	    Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
	  SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_START);

	  if ((sptr = strtok(NULL, ".:")) == NULL)
	    Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
	  SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_STOP);
	  
	  if ((sptr = strtok(NULL, ":\t ")) == NULL)
	    Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
	  SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_OLEN);

				/* rest of line is optional description */
	  if ((sptr = strtok(NULL, "\n")) != NULL)
	    SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_DESC);
	  
	  headnum++;
	}
      else if (strcmp(nptr, "#=CS") == 0) break;
      else if (strcmp(nptr, "#=RF") == 0) break;
      else if (strchr(commentsyms, *nptr) == NULL) break; /* non-comment, non-header */
    }
  

  currlen = 0;
  for (currblock = 0 ; currblock < blocknum; currblock++)
    {
				/* parse the block */
      seqidx = 0;
      while (nptr != NULL)
	{
				/* Consensus structure */
	  if (strcmp(nptr, "#=CS") == 0)
	    {
	      if (! copy_alignment_line(ainfo->cs, currlen, strlen(nptr)-1, 
					buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
		Die("Parse error in #=CS line in ReadSELEX()");
	    }

				/* Reference coordinates */
	  else if (strcmp(nptr, "#=RF") == 0)
	    {
	      if (! copy_alignment_line(ainfo->rf, currlen, strlen(nptr)-1, 
					buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
		Die("Parse error in #=RF line in ReadSELEX()");
	    }
				/* Individual secondary structure */
	  else if (strcmp(nptr, "#=SS") == 0)
	    {
	      if (! copy_alignment_line(ainfo->sqinfo[seqidx-1].ss, currlen, strlen(nptr)-1,
					buffer, blocks[currblock].lcol, 
					blocks[currblock].rcol, (char) '.'))
		Die("Parse error in #=SS line in ReadSELEX()");
	    }

				/* Side chain % surface accessibility code */
	  else if (strcmp(nptr, "#=SA") == 0)
	    {
	      if (! copy_alignment_line(ainfo->sqinfo[seqidx-1].sa, currlen, strlen(nptr)-1,
					buffer, blocks[currblock].lcol, 
					blocks[currblock].rcol, (char) '.'))
		Die("Parse error in #=SA line in ReadSELEX()");
	    }
				/* Aligned sequence; avoid unparsed machine comments */
	  else if (strncmp(nptr, "#=", 2) != 0)
	    {
	      if (! copy_alignment_line(aseqs[seqidx], currlen, strlen(nptr)-1, 
					buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
		Die("Parse error in alignment line in ReadSELEX()");
	      seqidx++;
	    }

				/* get next line */
	  for (;;)
	    {
	      nptr = NULL;
	      if (fgets(buffer, LINEBUFLEN, fp) == NULL) break;	/* EOF */
	      strcpy(bufcpy, buffer);
	      if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) break; /* blank */
	      if (strncmp(buffer, "#=", 2) == 0) break;      /* machine comment */
	      if (strchr(commentsyms, *nptr) == NULL) break; /* data */
	    }
	} /* end of a block */

      currlen += blocks[currblock].rcol - blocks[currblock].lcol + 1;

				/* get line 1 of next block */
      for (;;)
	{
	  if (fgets(buffer, LINEBUFLEN, fp) == NULL) break; /* no data */
	  strcpy(bufcpy, buffer);
	  if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) continue; /* blank */
	  if (strncmp(buffer, "#=", 2) == 0)       break; /* machine comment */
	  if (strchr(commentsyms, *nptr) == NULL) break; /* non-comment */
	}
    } /* end of the file */

  /* Lengths in sqinfo are for raw sequence (ungapped),
   * and SS, SA are 0..rlen-1 not 0..alen-1.
   * Only the seqs with structures come out of here with lengths set.
   */
  for (seqidx = 0; seqidx < num; seqidx++)
    {
      int apos, rpos;
				/* secondary structures */
      if (ainfo->sqinfo[seqidx].flags & SQINFO_SS)
	{
	  for (apos = rpos = 0; apos < alen; apos++)
	    if (! isgap(aseqs[seqidx][apos]))
	      {
		ainfo->sqinfo[seqidx].ss[rpos] = ainfo->sqinfo[seqidx].ss[apos];
		rpos++;
	      }
	  ainfo->sqinfo[seqidx].ss[rpos] = '\0';
	}
				/* Surface accessibility */
      if (ainfo->sqinfo[seqidx].flags & SQINFO_SA)
	{
	  for (apos = rpos = 0; apos < alen; apos++)
	    if (! isgap(aseqs[seqidx][apos]))
	      {
		ainfo->sqinfo[seqidx].sa[rpos] = ainfo->sqinfo[seqidx].sa[apos];
		rpos++;
	      }
	  ainfo->sqinfo[seqidx].sa[rpos] = '\0';
	}
    }

				/* NULL-terminate all the strings */
  if (ainfo->rf != NULL) ainfo->rf[alen] = '\0';
  if (ainfo->cs != NULL) ainfo->cs[alen] = '\0';
  for (seqidx = 0; seqidx < num; seqidx++)
    aseqs[seqidx][alen]            = '\0';
  
				/* find raw sequence lengths for sqinfo */
  for (seqidx = 0; seqidx < num; seqidx++)
    {
      count = 0;
      for (sptr = aseqs[seqidx]; *sptr != '\0'; sptr++)
	if (!isgap(*sptr)) count++;
      ainfo->sqinfo[seqidx].len    = count;
      ainfo->sqinfo[seqidx].flags |= SQINFO_LEN;
    }


  /***************************************************
   * Garbage collection and return
   ***************************************************/
  free(blocks);
  if (warn_names) 
    Warn("sequences may be in different orders in blocks of %s?", afp->fname);

  /* Convert back to MSA structure. (Wasteful kludge.)
   */
  msa = MSAFromAINFO(aseqs, ainfo);
  MSAVerifyParse(msa);
  FreeAlignment(aseqs, ainfo);
  return msa;
}
Пример #15
0
/* Function: main()
 *
 * Purpose:  EMBOSS-wrapped driver (registered as "ohmmalign") that reads
 *           one HMM, aligns every input sequence to it with Viterbi,
 *           optionally merges in existing alignments (mapali / withali),
 *           and writes the resulting multiple alignment in SELEX format.
 *
 * Args:     argc, argv -- command line, handed straight to embInitPV();
 *                         all options are then fetched through ajAcdGet*().
 *
 * Return:   0 (after embExit()). Fatal problems go through ajFatal().
 */
int main(int argc, char **argv) 
{
  /* HMMER-side working data */
  const char        *hmmfile;    /* name of the HMM file                      */
  HMMFILE           *hmmfp;      /* open handle on the HMM file               */
  const char        *seqfile;    /* name of the sequence file (banner only)   */
  char             **rseq;       /* raw, unaligned sequences                  */
  SQINFO            *sqinfo;     /* per-sequence information                  */
  char             **dsq;        /* digitized copies of the raw sequences     */
  int                nseq;       /* how many sequences we hold                */
  char             **aseq;       /* aligned sequences (output)                */
  AINFO              ainfo;      /* alignment information (output)            */
  float             *wgt;        /* per-sequence weights                      */
  int                idx;        /* sequence counter                          */
  struct plan7_s    *hmm;        /* the model everything is aligned to        */
  struct p7trace_s **tr;         /* one traceback per sequence                */

  int         be_quiet;          /* TRUE suppresses the banner                */
  int         matchonly;         /* TRUE: show only match state symbols       */
  const char *outfile;           /* name of the alignment output file         */
  FILE       *outfp;             /* stream the alignment is written to        */

  /* EMBOSS/AJAX-side handles */
  AjPFile   withfile;            /* ACD handle: extra alignment to align      */
  AjPFile   mapfile;             /* ACD handle: extra alignment to map        */
  AjPFile   ajout     = NULL;    /* ACD handle: output file                   */
  AjPStr    outfname  = NULL;    /* owns the text behind outfile              */
  AjPFile   ajhmm     = NULL;    /* ACD handle: HMM file                      */
  AjPStr    infname   = NULL;    /* owns the text behind hmmfile              */
  AjPSeqset seqset    = NULL;    /* ACD sequence set                          */
  AjPStr    ajseqfile = NULL;    /* owns the text behind seqfile              */
  char     *mapali    = NULL;    /* file name copied from mapfile, or NULL    */
  char     *withali   = NULL;    /* file name copied from withfile, or NULL   */

#ifdef MEMDEBUG
  unsigned long histid1, histid2, orig_size, current_size;
  orig_size = malloc_inuse(&histid1);
  fprintf(stderr, "[... memory debugging is ON ...]\n");
#endif

  /***********************************************
   * Fetch options through ACD
   ***********************************************/

  embInitPV("ohmmalign",argc,argv,"HMMER",VERSION);

  matchonly = ajAcdGetBoolean("matchonly") ? TRUE : FALSE;

  /* For the two optional alignment inputs only the file name is kept
   * (as a C string copy); the AJAX handle is closed again right away.
   */
  mapfile = ajAcdGetInfile("mapalifile");
  if (mapfile != NULL)
    mapali = ajCharNewS(ajFileGetNameS(mapfile));
  ajFileClose(&mapfile);

  withfile = ajAcdGetInfile("withalifile");
  if (withfile != NULL)
    withali = ajCharNewS(ajFileGetNameS(withfile));
  ajFileClose(&withfile);

  /* Output: remember the name. The AJAX handle is closed only when the
   * name starts with a character above 31; otherwise it is left open.
   */
  ajout    = ajAcdGetOutfile("outfile");
  outfname = ajStrNewC((char *)ajFileGetNameC(ajout));
  if (*ajStrGetPtr(outfname) > 31)
    ajFileClose(&ajout);
  outfile = ajStrGetPtr(outfname);

  /* HMM input: again only the name is needed; HMMFileOpen() reopens it. */
  ajhmm   = ajAcdGetInfile("hmmfile");
  infname = ajStrNewC((char *)ajFileGetNameC(ajhmm));
  ajFileClose(&ajhmm);
  hmmfile = ajStrGetPtr(infname);

  seqset    = ajAcdGetSeqset("sequences");
  ajseqfile = ajStrNewC(ajStrGetPtr(seqset->Filename));
  seqfile   = ajStrGetPtr(ajseqfile);

  /***********************************************
   * Read a single HMM (HMMFileOpen is given "HMMERDB" as the
   * environment variable to consult).
   *
   * hmmalign permits just one domain per sequence and no J state.
   * Rather than calling a Plan7*Config() function, which would lose
   * the S/W entry information, the E-state transitions are set
   * directly: MOVE = 1, LOOP = 0.
   ***********************************************/

  hmmfp = HMMFileOpen(hmmfile, "HMMERDB");
  if (hmmfp == NULL)
    ajFatal("Failed to open HMM file %s\n", hmmfile);
  if (!HMMFileRead(hmmfp, &hmm)) 
    ajFatal("Failed to read any HMMs from %s\n", hmmfile);
  HMMFileClose(hmmfp);
  if (hmm == NULL) 
    ajFatal("HMM file %s corrupt or in incorrect format? Parse failed", hmmfile);

  hmm->xt[XTE][MOVE] = 1.;
  hmm->xt[XTE][LOOP] = 0.;
  P7Logoddsify(hmm, TRUE);

  /* --mapali is only usable when the model carries a map */
  if (mapali != NULL && ! (hmm->flags & PLAN7_MAP))
    ajFatal("HMMER: HMM file %s has no map; you can't use --mapali.", hmmfile);

  /***********************************************
   * Get the sequences.
   * The squid file readers (SeqfileFormat / ReadMultipleRseqs) used by
   * stand-alone HMMER are not called here; emboss_rseqs() is handed the
   * ACD sequence set instead.
   ***********************************************/

  emboss_rseqs(seqset,&rseq,&sqinfo,&nseq);

  /***********************************************
   * Banner (always suppressed: be_quiet is forced to TRUE)
   ***********************************************/

  be_quiet = TRUE;
  if (! be_quiet) 
    {
      printf(   "HMM file:             %s\n", hmmfile);
      printf(   "Sequence file:        %s\n", seqfile);
      printf("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n\n");
    }

  /***********************************************
   * Align
   ***********************************************/

  dsq = MallocOrDie(sizeof(char *) * nseq);
  tr  = MallocOrDie(sizeof(struct p7trace_s *) * nseq);

  /* One traceback per sequence; P7SmallViterbi() is used when
   * P7ViterbiSize() reports a value above RAMLIMIT.
   */
  for (idx = 0; idx < nseq; idx++)
    {
      dsq[idx] = DigitizeSequence(rseq[idx], sqinfo[idx].len);

      if (P7ViterbiSize(sqinfo[idx].len, hmm->M) > RAMLIMIT)
	(void) P7SmallViterbi(dsq[idx], sqinfo[idx].len, hmm, &(tr[idx]));
      else
	(void) P7Viterbi(dsq[idx], sqinfo[idx].len, hmm, &(tr[idx]));
    }

  /* Merge in pre-existing alignments when requested. These calls get
   * the addresses of rseq/dsq/sqinfo/tr/nseq, so all of them may change.
   */
  if (mapali != NULL)
    include_alignment(mapali, hmm, TRUE, &rseq, &dsq, &sqinfo, &tr, &nseq);
  if (withali != NULL) 
    include_alignment(withali, hmm, FALSE, &rseq, &dsq, &sqinfo, &tr, &nseq);

  /* Tracebacks -> multiple alignment, every sequence weighted 1.0 */
  wgt = MallocOrDie(sizeof(float) * nseq);
  FSet(wgt, nseq, 1.0);
  P7Traces2Alignment(dsq, sqinfo, wgt, nseq, hmm->M, tr, matchonly,
		     &aseq, &ainfo);

  /***********************************************
   * Write the alignment: to outfile when it can be opened for
   * writing, otherwise to stdout.
   ***********************************************/

  outfp = (outfile != NULL) ? fopen(outfile, "w") : NULL;
  if (outfp == NULL)
    WriteSELEX(stdout, aseq, &ainfo, 50);
  else
    {
      WriteSELEX(outfp, aseq, &ainfo, 50);
      printf("Alignment saved in file %s\n", outfile);
      fclose(outfp);
    }

  /***********************************************
   * Release everything and leave
   ***********************************************/

  for (idx = 0; idx < nseq; idx++) 
    {
      P7FreeTrace(tr[idx]);
      FreeSequence(rseq[idx], &(sqinfo[idx]));
      free(dsq[idx]);
    }
  FreeAlignment(aseq, &ainfo);
  FreePlan7(hmm);
  free(sqinfo);
  free(rseq);
  free(dsq);
  free(wgt);
  free(tr);

  SqdClean();

  ajStrDel(&outfname);
  ajStrDel(&infname);
  ajStrDel(&ajseqfile);

#ifdef MEMDEBUG
  current_size = malloc_inuse(&histid2);
  if (current_size != orig_size) malloc_list(2, histid1, histid2);
  else fprintf(stderr, "[No memory leaks.]\n");
#endif

  ajSeqsetDel(&seqset);
  ajFileClose(&withfile);
  ajFileClose(&mapfile);

  embExit();

  return 0;
}