Exemple #1
0
/* Function: ReadMultipleRseqs()
 * 
 * Purpose:  Open a data file and
 *           parse it into an array of rseqs (raw, unaligned
 *           sequences).
 * 
 *           Caller is responsible for free'ing memory allocated
 *           to ret_rseqs, ret_weights, and ret_names.
 *           
 *           Weights are currently only supported for MSF format.
 *           Sequences read from all other formats will be assigned
 *           weights of 1.0. If the caller isn't interested in
 *           weights, it passes NULL as ret_weights.
 * 
 * Returns 1 on success. Returns 0 on failure and sets
 * squid_errno to indicate the cause.
 */
int
ReadMultipleRseqs(char              *seqfile,
		  int                fformat,
		  char            ***ret_rseqs,
		  SQINFO **ret_sqinfo,
		  int               *ret_num)
{
  SQINFO *sqinfo;               /* array of sequence optional info         */
  SQFILE *dbfp;                 /* open ptr for sequential access of file  */
  char  **rseqs;                /* sequence array                          */
  char  **aseqs;                /* aligned sequences, if file is aligned   */
  AINFO   ainfo;      /* alignment-associated information        */
  int     numalloced;           /* num of seqs currently alloced for       */
  int     idx;
  int     num;

  if (fformat == kSelex || fformat == kMSF || fformat == kClustal)
    {
      if (! ReadAlignment(seqfile, fformat, &aseqs, &ainfo)) return 0;
      if (! DealignAseqs(aseqs, ainfo.nseq, &rseqs))                return 0;

      /* copy the sqinfo array
       */
      num = ainfo.nseq;
      sqinfo= (SQINFO *) MallocOrDie (sizeof(SQINFO)*ainfo.nseq);
      for (idx = 0; idx < ainfo.nseq; idx++)
	SeqinfoCopy(&(sqinfo[idx]), &(ainfo.sqinfo[idx]));
      FreeAlignment(aseqs, &ainfo);
    }
  else
    {
				/* initial alloc */
      num        = 0;
      numalloced = 16;
      rseqs  = (char **) MallocOrDie (numalloced * sizeof(char *));
      sqinfo = (SQINFO *) MallocOrDie (numalloced * sizeof(SQINFO));
      if ((dbfp = SeqfileOpen(seqfile, fformat, NULL)) == NULL) return 0;      

      while (ReadSeq(dbfp, fformat, &rseqs[num], &(sqinfo[num])))
	{
	  num++;
	  if (num == numalloced) /* more seqs coming, alloc more room */
	    {
	      numalloced += 16;
	      rseqs  = (char **) ReallocOrDie (rseqs, numalloced*sizeof(char *));
	      sqinfo = (SQINFO *) ReallocOrDie (sqinfo, numalloced * sizeof(SQINFO));
	    }
	}
      SeqfileClose(dbfp);
    }

  *ret_rseqs  = rseqs;
  *ret_sqinfo = sqinfo;
  *ret_num    = num;
  return 1;
}
Exemple #2
0
void
SeqfileClose(SQFILE *sqfp)
{
				/* free static ptrs if we used them */
  if (sqfp->ali_aseqs != NULL) 
    FreeAlignment(sqfp->ali_aseqs, &(sqfp->ali_ainfo));

  if (sqfp->do_gzip)         pclose(sqfp->f);
  else if (! sqfp->do_stdin) fclose(sqfp->f);
  free(sqfp);
}
Exemple #3
0
int main(int argc, char **argv) 
{
    const char      *hmmfile;	/* file to read HMMs from                  */
    FILE            *fp;	/* output file handle                      */
    HMMFILE         *hmmfp;	/* opened hmmfile for reading              */
    struct plan7_s  *hmm;	/* HMM to generate from                    */
    int              L;		/* length of a sequence                    */
    int              i;		/* counter over sequences                  */

    char            *ofile;	/* output sequence file                    */
    int              nseq;	/* number of seqs to sample                */
    int              seed;	/* random number generator seed            */
    int              be_quiet;	/* TRUE to silence header/footer           */
    int              do_alignment; /* TRUE to output in aligned format     */ 
    int              do_consensus; /* TRUE to do a single consensus seq    */

    AjBool ajselex;
    AjBool ajcons;
    AjPFile inf=NULL;
    AjPFile outf=NULL;
    AjPStr  instr=NULL;
    AjPStr  outstr=NULL;
  

#ifdef MEMDEBUG
    unsigned long histid1, histid2, orig_size, current_size;
    orig_size = malloc_inuse(&histid1);
    fprintf(stderr, "[... memory debugging is ON ...]\n");
#endif

    /*********************************************** 
     * Parse command line
     ***********************************************/

    nseq         = 10;

    be_quiet     = FALSE;
    do_alignment = FALSE;  
    do_consensus = FALSE;
    ofile        = NULL;

    embInitPV("ohmmemit",argc,argv,"HMMER",VERSION);

    ajselex = ajAcdGetBoolean("selex");
    ajcons  = ajAcdGetBoolean("consensus");
    nseq    = ajAcdGetInt("number");
    seed    = ajAcdGetInt("seed");
    inf     = ajAcdGetInfile("infile");
    outf    = ajAcdGetOutfile("outfile");
  
    if(!seed)
	seed = time ((time_t *) NULL);

    if(ajselex)
	do_alignment=TRUE;
    else
	do_alignment=FALSE;
  
    if(ajcons)
	do_consensus=TRUE;
    else
	do_consensus=FALSE;

    instr  = ajStrNewC((char *)ajFileGetNameC(inf));
    outstr = ajStrNewC((char *)ajFileGetNameC(outf));

    hmmfile = ajStrGetPtr(instr);

    sre_srandom(seed);

    if (do_alignment && do_consensus)
	ajFatal("Sorry, -selex and -consensus are incompatible.\n"); 
    if (nseq != 10 && do_consensus)
	ajWarn("-consensus overrides -number (# of sampled seqs)");

    /*********************************************** 
     * Open HMM file (might be in HMMERDB or current directory).
     * Read a single HMM from it.
     ***********************************************/

    if ((hmmfp = HMMFileOpen(hmmfile, "HMMERDB")) == NULL)
	ajFatal("Failed to open HMM file %s\n", hmmfile);
    if (!HMMFileRead(hmmfp, &hmm)) 
	ajFatal("Failed to read any HMMs from %s\n", hmmfile);
    HMMFileClose(hmmfp);
    if (hmm == NULL) 
	ajFatal("HMM file %s corrupt or in incorrect format? Parse failed",
		hmmfile);

    /* Configure the HMM to shut off N,J,C emission: so we
     * do a simple single pass through the model.
     */
    Plan7NakedConfig(hmm);
    Plan7Renormalize(hmm);

    /*********************************************** 
     * Open the output file, or stdout
     ***********************************************/ 

    fp = ajFileGetFileptr(outf);
  
 
    /*********************************************** 
     * Show the options banner
     ***********************************************/
    be_quiet=TRUE;
    if (! be_quiet) 
    {
	printf("HMM file:             %s\n", hmmfile);
	if (! do_consensus)
	{
	    printf("Number of seqs:       %d\n", nseq);
	    printf("Random seed:          %d\n", seed);
	}
	printf("- - - - - - - - - - - - - - - - - - - - - - - - - "
	       "- - - - - - -\n\n");
    }

    /*********************************************** 
     * Do the work.
     * If we're generating an alignment, we have to collect
     * all our traces, then output. If we're generating unaligned
     * sequences, we can emit one at a time.
     ***********************************************/

    if (do_consensus) 
    {
	char    *seq;
	SQINFO   sqinfo;	/* info about sequence (name/desc)        */

	EmitConsensusSequence(hmm, &seq, NULL, &L, NULL);
	strcpy(sqinfo.name, "consensus");
	sqinfo.len = L;
	sqinfo.flags = SQINFO_NAME | SQINFO_LEN;

	WriteSeq(fp, kPearson, seq, &sqinfo);
	free(seq);
    }
    else if (do_alignment)
    {
	struct p7trace_s **tr;
	char           **dsq;
	SQINFO          *sqinfo;
	char           **aseq;
	AINFO            ainfo;
	float           *wgt;

	dsq    = MallocOrDie(sizeof(char *)             * nseq);
	tr     = MallocOrDie(sizeof(struct p7trace_s *) * nseq);
	sqinfo = MallocOrDie(sizeof(SQINFO)             * nseq);
	wgt    = MallocOrDie(sizeof(float)              * nseq);
	FSet(wgt, nseq, 1.0);

	for (i = 0; i < nseq; i++)
	{
	    EmitSequence(hmm, &(dsq[i]), &L, &(tr[i]));
	    sprintf(sqinfo[i].name, "seq%d", i+1);
	    sqinfo[i].len   = L;
	    sqinfo[i].flags = SQINFO_NAME | SQINFO_LEN;
	}

	P7Traces2Alignment(dsq, sqinfo, wgt, nseq, hmm->M, tr, FALSE, 
			   &aseq, &ainfo);

	/* Output the alignment */
	WriteSELEX(fp, aseq, &ainfo, 50);
	if (ofile != NULL && !be_quiet)
	    printf("Alignment saved in file %s\n", ofile);

	/* Free memory
	 */
	for (i = 0; i < nseq; i++) 
	{
	    P7FreeTrace(tr[i]);
	    free(dsq[i]);
	}
	FreeAlignment(aseq, &ainfo);
	free(sqinfo);
	free(dsq);
	free(wgt);
	free(tr);
    }
    else				/* unaligned sequence output */
    {
	struct p7trace_s *tr;
	char             *dsq;
	char             *seq;
	SQINFO            sqinfo;

	for (i = 0; i < nseq; i++)
	{
	    EmitSequence(hmm, &dsq, &L, &tr);
	    sprintf(sqinfo.name, "seq%d", i+1);
	    sqinfo.len   = L;
	    sqinfo.flags = SQINFO_NAME | SQINFO_LEN;

	    seq = DedigitizeSequence(dsq, L);

	    WriteSeq(fp, kPearson, seq, &sqinfo);
	  
	    P7FreeTrace(tr);
	    free(dsq);
	    free(seq);
	}
    }

    ajFileClose(&outf);
  
    FreePlan7(hmm);
    SqdClean();

#ifdef MEMDEBUG
    current_size = malloc_inuse(&histid2);
    if (current_size != orig_size)
	malloc_list(2, histid1, histid2);
    else
	fprintf(stderr, "[No memory leaks.]\n");
#endif


    ajStrDel(&instr);
    ajStrDel(&outstr);
    ajFileClose(&inf);
    ajFileClose(&outf);

    embExit();
    return 0;
}
Exemple #4
0
/* Function: ReadSELEX()
 * Date:     SRE, Sun Jun  6 18:24:09 1999 [St. Louis]
 *
 * Purpose:  Parse an alignment read from an open SELEX format
 *           alignment file. (SELEX is a single alignment format).
 *           Return the alignment, or NULL if we've already read the
 *           alignment or there's no alignment data in the file.
 *           
 * Limitations: SELEX is the only remaining multipass parser for
 *           alignment files. It cannot read from gzip or from stdin.
 *           It Die()'s here if you try. The reason for this
 *           that SELEX allows space characters as gaps, so we don't
 *           know the borders of an alignment block until we've seen
 *           the whole block. I could rewrite to allow single-pass
 *           parsing (by storing the whole block in memory) but
 *           since SELEX is now legacy, why bother.
 *           
 *           Note that the interface is totally kludged: fastest
 *           possible adaptation of old ReadSELEX() to the new
 *           MSA interface.  
 *
 * Args:     afp  - open alignment file
 *
 * Returns:  MSA *  - an alignment object
 *                    caller responsible for an MSAFree()
 *           NULL if no alignment data.          
 */
MSA *
ReadSELEX(MSAFILE *afp)
{
  MSA     *msa;                 /* RETURN: mult seq alignment   */
  FILE    *fp;                  /* ptr to opened seqfile        */
  char   **aseqs;               /* aligned seqs                 */
  int      num = 0;		/* number of seqs read          */
  char     buffer[LINEBUFLEN];	/* input buffer for lines       */
  char     bufcpy[LINEBUFLEN];	/* strtok'able copy of buffer   */
  struct block_struc {          /** alignment data for a block: */
    int lcol;			/* furthest left aligned sym    */
    int rcol;			/* furthest right aligned sym   */
  } *blocks = NULL;
  int      blocknum;		/* number of blocks in file     */
  char    *nptr;                /* ptr to start of name on line */
  char    *sptr;                /* ptr into sequence on line    */
  int      currnum;		/* num. seqs in given block     */
  int      currblock;		/* index for blocks             */
  int      i;			/* loop counter                 */
  int      seqidx;		/* counter for seqs             */
  int      alen;                /* length of alignment          */
  int      warn_names;          /* becomes TRUE if names don't match between blocks */
  int      headnum;		/* seqidx in per-sequence header info */
  int      currlen;
  int      count;
  int      have_cs = 0;
  int      have_rf = 0;
  AINFO    base_ainfo, *ainfo;	/* hack: used to be passed ptr to AINFO */


  /* Convert from MSA interface to what old ReadSELEX() did:
   *     - copy our open fp, rather than opening file
   *     - verify that we're not reading a gzip or stdin
   */
  if (feof(afp->f)) return NULL;
  if (afp->do_gzip || afp->do_stdin)
    Die("Can't read a SELEX format alignment from a pipe, stdin, or gzip'ed file"); 
  fp    = afp->f;
  ainfo = &base_ainfo;

  /***************************************************
   * First pass across file. 
   * Count seqs, get names, determine column info
   * Determine what sorts of info are active in this file.
   ***************************************************/

  InitAinfo(ainfo);
				/* get first line of the block 
				 * (non-comment, non-blank) */
  do
    {
      if (fgets(buffer, LINEBUFLEN, fp) == NULL)
	{ squid_errno = SQERR_NODATA; return 0; }
      strcpy(bufcpy, buffer);
      if (*buffer == '#')
	{
	  if      (strncmp(buffer, "#=CS",    4) == 0) have_cs = 1;
	  else if (strncmp(buffer, "#=RF",    4) == 0) have_rf = 1;
	}
    }
  while ((nptr = strtok(bufcpy, WHITESPACE)) == NULL || 
	 (strchr(commentsyms, *nptr) != NULL));

  blocknum   = 0;
  warn_names = FALSE;
  while (!feof(fp))
    {
				/* allocate for info about this block. */
      if (blocknum == 0)
	blocks = (struct block_struc *) MallocOrDie (sizeof(struct block_struc));
      else 
	blocks = (struct block_struc *) ReallocOrDie (blocks, (blocknum+1) * sizeof(struct block_struc));
      blocks[blocknum].lcol = LINEBUFLEN+1;
      blocks[blocknum].rcol = -1;
	
      currnum = 0;
      while (nptr != NULL)	/* becomes NULL when this block ends. */
      {
				/* First block only: save names */
	if (blocknum == 0)
	  {
	    if (currnum == 0)
	      ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO));
	    else 
	      ainfo->sqinfo = (SQINFO *) ReallocOrDie (ainfo->sqinfo, (currnum + 1) * sizeof(SQINFO));

	    ainfo->sqinfo[currnum].flags = 0;
	    SetSeqinfoString(&(ainfo->sqinfo[currnum]), nptr, SQINFO_NAME);
	  }
	else			/* in each additional block: check names */
	  {
	    if (strcmp(ainfo->sqinfo[currnum].name, nptr) != 0)
	      warn_names = TRUE;
	  }
	currnum++;

				/* check rcol, lcol */
	if ((sptr = strtok(NULL, WHITESPACE)) != NULL)
	  {
				/* is this the furthest left we've
				   seen word 2 in this block? */
	    if (sptr - bufcpy < blocks[blocknum].lcol) 
	      blocks[blocknum].lcol = sptr - bufcpy;
				/* look for right side in buffer */
	    for (sptr = buffer + strlen(buffer) - 1;  
		 strchr(WHITESPACE, *sptr) != NULL;
		 sptr --)
	      /* do nothing */ ;
	    if (sptr - buffer > blocks[blocknum].rcol)
	      blocks[blocknum].rcol = sptr - buffer;
	  }

				/* get the next line; blank line means end of block */
	do
	  {
	    if (fgets(buffer, LINEBUFLEN, fp) == NULL) 
	      { nptr = NULL; break; }
	    strcpy(bufcpy, buffer);

	    if      (strncmp(buffer, "#=SS",    4) == 0) ainfo->sqinfo[currnum-1].flags |= SQINFO_SS;
	    else if (strncmp(buffer, "#=SA",    4) == 0) ainfo->sqinfo[currnum-1].flags |= SQINFO_SA;
	    else if (strncmp(buffer, "#=CS",    4) == 0) have_cs = 1;
	    else if (strncmp(buffer, "#=RF",    4) == 0) have_rf = 1;

	    if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) 
	      break;
	  } while (strchr(commentsyms, *nptr) != NULL);
      }


				/* check that number of sequences matches expected */
      if (blocknum == 0)
	num = currnum;
      else if (currnum != num)
	Die("Parse error in ReadSELEX()");
      blocknum++;

				/* get first line of next block 
				 * (non-comment, non-blank) */
      do
	{
	  if (fgets(buffer, LINEBUFLEN, fp) == NULL) { nptr = NULL; break; }
	  strcpy(bufcpy, buffer);
	}
      while ((nptr = strtok(bufcpy, WHITESPACE)) == NULL || 
	     (strchr(commentsyms, *nptr) != NULL));
    }

  
  /***************************************************
   * Get ready for second pass:
   *   figure out the length of the alignment
   *   malloc space
   *   rewind the file
   ***************************************************/

  alen = 0;
  for (currblock = 0; currblock < blocknum; currblock++)
    alen += blocks[currblock].rcol - blocks[currblock].lcol + 1;

  rewind(fp);

  /* allocations. we can't use AllocateAlignment because of
   * the way we already used ainfo->sqinfo.
   */
  aseqs     = (char **) MallocOrDie (num * sizeof(char *));
  if (have_cs) 
    ainfo->cs = (char *) MallocOrDie ((alen+1) * sizeof(char));
  if (have_rf) 
    ainfo->rf = (char *) MallocOrDie ((alen+1) * sizeof(char));

  
  
  for (i = 0; i < num; i++)
    {
      aseqs[i]     = (char *) MallocOrDie ((alen+1) * sizeof(char));
      if (ainfo->sqinfo[i].flags & SQINFO_SS)
	ainfo->sqinfo[i].ss = (char *) MallocOrDie ((alen+1) * sizeof(char));
      if (ainfo->sqinfo[i].flags & SQINFO_SA)
	ainfo->sqinfo[i].sa = (char *) MallocOrDie ((alen+1) * sizeof(char));
    }
  
  ainfo->alen = alen;
  ainfo->nseq = num; 
  ainfo->wgt  = (float *) MallocOrDie (sizeof(float) * num);
  FSet(ainfo->wgt, num, 1.0);

  /***************************************************
   * Second pass across file. Parse header; assemble sequences
   ***************************************************/
  /* We've now made a complete first pass over the file. We know how
   * many blocks it contains, we know the number of seqs in the first
   * block, and we know every block has the same number of blocks;
   * so we can be a bit more cavalier about error-checking as we
   * make the second pass.
   */

  /* Look for header
   */
  headnum = 0;
  for (;;)
    {
      if (fgets(buffer, LINEBUFLEN, fp) == NULL)
	Die("Parse error in ReadSELEX()");
      strcpy(bufcpy, buffer);
      if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) continue; /* skip blank lines */

      if (strcmp(nptr, "#=AU") == 0  && (sptr = strtok(NULL, "\n")) != NULL)
	ainfo->au = Strdup(sptr);
      else if (strcmp(nptr, "#=ID") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
	ainfo->name = Strdup(sptr);
      else if (strcmp(nptr, "#=AC") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
	ainfo->acc  = Strdup(sptr);
      else if (strcmp(nptr, "#=DE") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
	ainfo->desc = Strdup(sptr);
      else if (strcmp(nptr, "#=GA") == 0)
	{
	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 
	    Die("Parse error in #=GA line in ReadSELEX()");
	  ainfo->ga1 = atof(sptr);

	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 
	    Die("Parse error in #=GA line in ReadSELEX()");
	  ainfo->ga2 = atof(sptr);

	  ainfo->flags |= AINFO_GA;
	}
      else if (strcmp(nptr, "#=TC") == 0)
	{
	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 
	    Die("Parse error in #=TC line in ReadSELEX()");
	  ainfo->tc1 = atof(sptr);

	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 
	    Die("Parse error in #=TC line in ReadSELEX()");
	  ainfo->tc2 = atof(sptr);

	  ainfo->flags |= AINFO_TC;
	}
      else if (strcmp(nptr, "#=NC") == 0)
	{
	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 
	    Die("Parse error in #=NC line in ReadSELEX()");
	  ainfo->nc1 = atof(sptr);

	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 
	    Die("Parse error in #=NC line in ReadSELEX()");
	  ainfo->nc2 = atof(sptr);

	  ainfo->flags |= AINFO_NC;
	}
      else if (strcmp(nptr, "#=SQ") == 0)      /* per-sequence header info */
	{
				/* first field is the name */
	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
	    Die("Parse error in #=SQ line in ReadSELEX()");
	  if (strcmp(sptr, ainfo->sqinfo[headnum].name) != 0) warn_names = TRUE;

				/* second field is the weight */
	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
	    Die("Parse error in #=SQ line in ReadSELEX()");
	  if (!IsReal(sptr)) 
	    Die("Parse error in #=SQ line in ReadSELEX(): weight is not a number");
	  ainfo->wgt[headnum] = atof(sptr);

				/* third field is database source id */
	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
	    Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
	  SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_ID);

				/* fourth field is database accession number */
	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
	    Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
	  SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_ACC);

				/* fifth field is start..stop::olen */
	  if ((sptr = strtok(NULL, ".:")) == NULL)
	    Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
	  SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_START);

	  if ((sptr = strtok(NULL, ".:")) == NULL)
	    Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
	  SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_STOP);
	  
	  if ((sptr = strtok(NULL, ":\t ")) == NULL)
	    Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
	  SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_OLEN);

				/* rest of line is optional description */
	  if ((sptr = strtok(NULL, "\n")) != NULL)
	    SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_DESC);
	  
	  headnum++;
	}
      else if (strcmp(nptr, "#=CS") == 0) break;
      else if (strcmp(nptr, "#=RF") == 0) break;
      else if (strchr(commentsyms, *nptr) == NULL) break; /* non-comment, non-header */
    }
  

  currlen = 0;
  for (currblock = 0 ; currblock < blocknum; currblock++)
    {
				/* parse the block */
      seqidx = 0;
      while (nptr != NULL)
	{
				/* Consensus structure */
	  if (strcmp(nptr, "#=CS") == 0)
	    {
	      if (! copy_alignment_line(ainfo->cs, currlen, strlen(nptr)-1, 
					buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
		Die("Parse error in #=CS line in ReadSELEX()");
	    }

				/* Reference coordinates */
	  else if (strcmp(nptr, "#=RF") == 0)
	    {
	      if (! copy_alignment_line(ainfo->rf, currlen, strlen(nptr)-1, 
					buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
		Die("Parse error in #=RF line in ReadSELEX()");
	    }
				/* Individual secondary structure */
	  else if (strcmp(nptr, "#=SS") == 0)
	    {
	      if (! copy_alignment_line(ainfo->sqinfo[seqidx-1].ss, currlen, strlen(nptr)-1,
					buffer, blocks[currblock].lcol, 
					blocks[currblock].rcol, (char) '.'))
		Die("Parse error in #=SS line in ReadSELEX()");
	    }

				/* Side chain % surface accessibility code */
	  else if (strcmp(nptr, "#=SA") == 0)
	    {
	      if (! copy_alignment_line(ainfo->sqinfo[seqidx-1].sa, currlen, strlen(nptr)-1,
					buffer, blocks[currblock].lcol, 
					blocks[currblock].rcol, (char) '.'))
		Die("Parse error in #=SA line in ReadSELEX()");
	    }
				/* Aligned sequence; avoid unparsed machine comments */
	  else if (strncmp(nptr, "#=", 2) != 0)
	    {
	      if (! copy_alignment_line(aseqs[seqidx], currlen, strlen(nptr)-1, 
					buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
		Die("Parse error in alignment line in ReadSELEX()");
	      seqidx++;
	    }

				/* get next line */
	  for (;;)
	    {
	      nptr = NULL;
	      if (fgets(buffer, LINEBUFLEN, fp) == NULL) break;	/* EOF */
	      strcpy(bufcpy, buffer);
	      if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) break; /* blank */
	      if (strncmp(buffer, "#=", 2) == 0) break;      /* machine comment */
	      if (strchr(commentsyms, *nptr) == NULL) break; /* data */
	    }
	} /* end of a block */

      currlen += blocks[currblock].rcol - blocks[currblock].lcol + 1;

				/* get line 1 of next block */
      for (;;)
	{
	  if (fgets(buffer, LINEBUFLEN, fp) == NULL) break; /* no data */
	  strcpy(bufcpy, buffer);
	  if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) continue; /* blank */
	  if (strncmp(buffer, "#=", 2) == 0)       break; /* machine comment */
	  if (strchr(commentsyms, *nptr) == NULL) break; /* non-comment */
	}
    } /* end of the file */

  /* Lengths in sqinfo are for raw sequence (ungapped),
   * and SS, SA are 0..rlen-1 not 0..alen-1.
   * Only the seqs with structures come out of here with lengths set.
   */
  for (seqidx = 0; seqidx < num; seqidx++)
    {
      int apos, rpos;
				/* secondary structures */
      if (ainfo->sqinfo[seqidx].flags & SQINFO_SS)
	{
	  for (apos = rpos = 0; apos < alen; apos++)
	    if (! isgap(aseqs[seqidx][apos]))
	      {
		ainfo->sqinfo[seqidx].ss[rpos] = ainfo->sqinfo[seqidx].ss[apos];
		rpos++;
	      }
	  ainfo->sqinfo[seqidx].ss[rpos] = '\0';
	}
				/* Surface accessibility */
      if (ainfo->sqinfo[seqidx].flags & SQINFO_SA)
	{
	  for (apos = rpos = 0; apos < alen; apos++)
	    if (! isgap(aseqs[seqidx][apos]))
	      {
		ainfo->sqinfo[seqidx].sa[rpos] = ainfo->sqinfo[seqidx].sa[apos];
		rpos++;
	      }
	  ainfo->sqinfo[seqidx].sa[rpos] = '\0';
	}
    }

				/* NULL-terminate all the strings */
  if (ainfo->rf != NULL) ainfo->rf[alen] = '\0';
  if (ainfo->cs != NULL) ainfo->cs[alen] = '\0';
  for (seqidx = 0; seqidx < num; seqidx++)
    aseqs[seqidx][alen]            = '\0';
  
				/* find raw sequence lengths for sqinfo */
  for (seqidx = 0; seqidx < num; seqidx++)
    {
      count = 0;
      for (sptr = aseqs[seqidx]; *sptr != '\0'; sptr++)
	if (!isgap(*sptr)) count++;
      ainfo->sqinfo[seqidx].len    = count;
      ainfo->sqinfo[seqidx].flags |= SQINFO_LEN;
    }


  /***************************************************
   * Garbage collection and return
   ***************************************************/
  free(blocks);
  if (warn_names) 
    Warn("sequences may be in different orders in blocks of %s?", afp->fname);

  /* Convert back to MSA structure. (Wasteful kludge.)
   */
  msa = MSAFromAINFO(aseqs, ainfo);
  MSAVerifyParse(msa);
  FreeAlignment(aseqs, ainfo);
  return msa;
}
Exemple #5
0
int main(int argc, char **argv) 
{
  const char      *hmmfile;	/* file to read HMMs from                  */
  HMMFILE         *hmmfp;       /* opened hmmfile for reading              */
  const char      *seqfile;     /* file to read target sequence from       */ 
  char           **rseq;        /* raw, unaligned sequences                */ 
  SQINFO          *sqinfo;      /* info associated with sequences          */
  char           **dsq;         /* digitized raw sequences                 */
  int              nseq;        /* number of sequences                     */  
  char           **aseq;        /* aligned sequences                       */
  AINFO            ainfo;       /* alignment information                   */
  float           *wgt;         /* per-sequence weights                    */
  int              i;
  struct plan7_s    *hmm;       /* HMM to align to                         */ 
  struct p7trace_s **tr;        /* traces for aligned sequences            */

  int   be_quiet;		/* TRUE to suppress verbose banner          */
  int   matchonly;		/* TRUE to show only match state syms       */
  const char *outfile;          /* optional alignment output file           */
  FILE *ofp;                    /* handle on alignment output file          */
  AjPFile ajwithali;          /* name of additional alignment file to align */
  AjPFile ajmapali;           /* name of additional alignment file to map   */
  AjBool ajmatch=ajFalse;
  AjPFile outf=NULL;
  AjPStr  outfname=NULL;
  AjPFile inf=NULL;
  AjPStr  infname=NULL;
  AjPSeqset seqset=NULL;
  AjPStr  ajseqfile=NULL;
  char*  mapali=NULL;
  char*  withali=NULL;
  
#ifdef MEMDEBUG
  unsigned long histid1, histid2, orig_size, current_size;
  orig_size = malloc_inuse(&histid1);
  fprintf(stderr, "[... memory debugging is ON ...]\n");
#endif

  /*********************************************** 
   * Parse command line
   ***********************************************/
  
  matchonly = FALSE;
  outfile   = NULL;
  be_quiet  = FALSE;
  withali   = NULL;
  mapali    = NULL;

  embInitPV("ohmmalign",argc,argv,"HMMER",VERSION);

  ajmatch = ajAcdGetBoolean("matchonly");
  if(ajmatch)
      matchonly=TRUE;
  else
      matchonly=FALSE;



  ajmapali = ajAcdGetInfile("mapalifile");
  if (ajmapali)
      mapali = ajCharNewS(ajFileGetNameS(ajmapali));
  ajFileClose(&ajmapali);
  ajwithali = ajAcdGetInfile("withalifile");
  if (ajwithali)
      withali = ajCharNewS(ajFileGetNameS(ajwithali));
  ajFileClose(&ajwithali);

  be_quiet=TRUE;



  outf = ajAcdGetOutfile("outfile");
  outfname = ajStrNewC((char *)ajFileGetNameC(outf));
  if(*ajStrGetPtr(outfname)>31)
      ajFileClose(&outf);
  outfile = ajStrGetPtr(outfname);

  inf = ajAcdGetInfile("hmmfile");
  infname = ajStrNewC((char *)ajFileGetNameC(inf));
  ajFileClose(&inf);
  hmmfile = ajStrGetPtr(infname);

  
  seqset = ajAcdGetSeqset("sequences");
  ajseqfile = ajStrNewC(ajStrGetPtr(seqset->Filename));
  seqfile = ajStrGetPtr(ajseqfile);
  

 /*********************************************** 
  * Open HMM file (might be in HMMERDB or current directory).
  * Read a single HMM from it.
  * 
  * Currently hmmalign disallows the J state and
  * only allows one domain per sequence. To preserve
  * the S/W entry information, the J state is explicitly
  * disallowed, rather than calling a Plan7*Config() function.
  * this is a workaround in 2.1 for the 2.0.x "yo!" bug.
  ***********************************************/

  if ((hmmfp = HMMFileOpen(hmmfile, "HMMERDB")) == NULL)
    ajFatal("Failed to open HMM file %s\n", hmmfile);
  if (!HMMFileRead(hmmfp, &hmm)) 
    ajFatal("Failed to read any HMMs from %s\n", hmmfile);
  HMMFileClose(hmmfp);
  if (hmm == NULL) 
    ajFatal("HMM file %s corrupt or in incorrect format? Parse failed", hmmfile);
  hmm->xt[XTE][MOVE] = 1.;	      /* only 1 domain/sequence ("global" alignment) */
  hmm->xt[XTE][LOOP] = 0.;
  P7Logoddsify(hmm, TRUE);
				/* do we have the map we might need? */
  if (mapali != NULL && ! (hmm->flags & PLAN7_MAP))
    ajFatal("HMMER: HMM file %s has no map; you can't use --mapali.", hmmfile);

  /*********************************************** 
   * Open sequence file in current directory.
   * Read all seqs from it.
   ***********************************************/
/*
  if (! SeqfileFormat(seqfile, &format, NULL))
    switch (squid_errno) {
    case SQERR_NOFILE: 
      ajFatal("Sequence file %s could not be opened for reading", seqfile);
    case SQERR_FORMAT: 
    default:           
      ajFatal("Failed to determine format of sequence file %s", seqfile);
    }
  if (! ReadMultipleRseqs(seqfile, format, &rseq, &sqinfo, &nseq))
    ajFatal("Failed to read any sequences from file %s", seqfile);
*/

  emboss_rseqs(seqset,&rseq,&sqinfo,&nseq);

  /*********************************************** 
   * Show the banner
   ***********************************************/

  be_quiet=TRUE;
  if (! be_quiet) 
    {
/*      Banner(stdout, banner); */
      printf(   "HMM file:             %s\n", hmmfile);
      printf(   "Sequence file:        %s\n", seqfile);
      printf("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n\n");
    }

  /*********************************************** 
   * Do the work
   ***********************************************/

  /* Allocations and initializations.
   */
  dsq = MallocOrDie(sizeof(char *) * nseq);
  tr  = MallocOrDie(sizeof(struct p7trace_s *) * nseq);

  /* Align each sequence to the model, collect traces
   */
  for (i = 0; i < nseq; i++)
    {
      dsq[i] = DigitizeSequence(rseq[i], sqinfo[i].len);

      if (P7ViterbiSize(sqinfo[i].len, hmm->M) <= RAMLIMIT)
	(void) P7Viterbi(dsq[i], sqinfo[i].len, hmm, &(tr[i]));
      else
	(void) P7SmallViterbi(dsq[i], sqinfo[i].len, hmm, &(tr[i]));
    }

  /* Include an aligned alignment, if desired.
   */
  if (mapali != NULL)
    include_alignment(mapali, hmm, TRUE, &rseq, &dsq, &sqinfo, &tr, &nseq);
  if (withali != NULL) 
    include_alignment(withali, hmm, FALSE, &rseq, &dsq, &sqinfo, &tr, &nseq);

  /* Turn traces into a multiple alignment
   */ 
  wgt = MallocOrDie(sizeof(float) * nseq);
  FSet(wgt, nseq, 1.0);
  P7Traces2Alignment(dsq, sqinfo, wgt, nseq, hmm->M, tr, matchonly,
		     &aseq, &ainfo);

  /*********************************************** 
   * Output the alignment
   ***********************************************/

  if (outfile != NULL && (ofp = fopen(outfile, "w")) != NULL)
    {
      WriteSELEX(ofp, aseq, &ainfo, 50);
      printf("Alignment saved in file %s\n", outfile);
      fclose(ofp);
    }
  else
    WriteSELEX(stdout, aseq, &ainfo, 50);

  /*********************************************** 
   * Cleanup and exit
   ***********************************************/
  
  for (i = 0; i < nseq; i++) 
    {
      P7FreeTrace(tr[i]);
      FreeSequence(rseq[i], &(sqinfo[i]));
      free(dsq[i]);
    }
  FreeAlignment(aseq, &ainfo);
  FreePlan7(hmm);
  free(sqinfo);
  free(rseq);
  free(dsq);
  free(wgt);
  free(tr);

  SqdClean();

  ajStrDel(&outfname);
  ajStrDel(&infname);
  ajStrDel(&ajseqfile);
  

#ifdef MEMDEBUG
  current_size = malloc_inuse(&histid2);
  if (current_size != orig_size) malloc_list(2, histid1, histid2);
  else fprintf(stderr, "[No memory leaks.]\n");
#endif

  ajSeqsetDel(&seqset);
  ajFileClose(&ajwithali);
  ajFileClose(&ajmapali);

  embExit();
  
  return 0;
}
Exemple #6
0
/* Function: include_alignment()
 * Date:     SRE, Sun Jul  5 15:25:13 1998 [St. Louis]
 *
 * Purpose:  Given the name of a multiple alignment file,
 *           align that alignment to the HMM, and add traces
 *           to an existing array of traces. If do_mapped
 *           is TRUE, we use the HMM's map file. If not,
 *           we use P7ViterbiAlignAlignment().
 *
 * Args:     seqfile  - name of alignment file
 *           hmm      - model to align to
 *           do_mapped- TRUE if we're to use the HMM's alignment map
 *           rsq      - RETURN: array of rseqs to add to
 *           dsq      - RETURN: array of dsq to add to
 *           sqinfo   - RETURN: array of SQINFO to add to
 *           tr       - RETURN: array of traces to add to
 *           nseq     - RETURN: number of seqs           
 *
 * Returns:  new, realloc'ed arrays for rsq, dsq, sqinfo, tr; nseq is
 *           increased to nseq+ainfo.nseq.
 */
void
include_alignment(char *seqfile, struct plan7_s *hmm, int do_mapped,
		  char ***rsq, char ***dsq, SQINFO **sqinfo, 
		  struct p7trace_s ***tr, int *nseq)
{
  int format;			/* format of alignment file */
  char **aseq;			/* aligned seqs             */
  char **newdsq;
  char **newrseq;
  AINFO ainfo;			/* info that goes with aseq */
  int   idx;			/* counter over aseqs       */
  struct p7trace_s *master;     /* master trace             */
  struct p7trace_s **addtr;     /* individual traces for aseq */

  if (! SeqfileFormat(seqfile, &format, NULL))
    switch (squid_errno) {
    case SQERR_NOFILE: 
      ajFatal("Alignment file %s could not be opened for reading", seqfile);
      /*FALLTHRU*/ /* a white lie to shut lint up */
    case SQERR_FORMAT: 
    default:           
      ajFatal("Failed to determine format of alignment file %s", seqfile);
    }
				/* read the alignment from file */
  if (! ReadAlignment(seqfile, format, &aseq, &ainfo))
    ajFatal("Failed to read aligned sequence file %s", seqfile);
  for (idx = 0; idx < ainfo.nseq; idx++)
    s2upper(aseq[idx]);
				/* Verify checksums before mapping */
  if (do_mapped && GCGMultchecksum(aseq, ainfo.nseq) != hmm->checksum)
    ajFatal("The checksums for alignment file %s and the HMM alignment map don't match.", 
	seqfile);
				/* Get a master trace */
  if (do_mapped) master = MasterTraceFromMap(hmm->map, hmm->M, ainfo.alen);
  else           master = P7ViterbiAlignAlignment(aseq, &ainfo, hmm);

				/* convert to individual traces */
  ImposeMasterTrace(aseq, ainfo.nseq, master, &addtr);
				/* add those traces to existing ones */
  *tr = MergeTraceArrays(*tr, *nseq, addtr, ainfo.nseq);
  
				/* additional bookkeeping: add to dsq, sqinfo */
  *rsq = ReallocOrDie((*rsq), sizeof(char *) * (*nseq + ainfo.nseq));
  DealignAseqs(aseq, ainfo.nseq, &newrseq);
  for (idx = *nseq; idx < *nseq + ainfo.nseq; idx++)
    (*rsq)[idx] = newrseq[idx - (*nseq)];
  free(newrseq);

  *dsq = ReallocOrDie((*dsq), sizeof(char *) * (*nseq + ainfo.nseq));
  DigitizeAlignment(aseq, &ainfo, &newdsq);
  for (idx = *nseq; idx < *nseq + ainfo.nseq; idx++)
    (*dsq)[idx] = newdsq[idx - (*nseq)];
  free(newdsq);
				/* unnecessarily complex, but I can't be bothered... */
  *sqinfo = ReallocOrDie((*sqinfo), sizeof(SQINFO) * (*nseq + ainfo.nseq));
  for (idx = *nseq; idx < *nseq + ainfo.nseq; idx++)
    SeqinfoCopy(&((*sqinfo)[idx]), &(ainfo.sqinfo[idx - (*nseq)]));
  
  *nseq = *nseq + ainfo.nseq;

				/* Cleanup */
  P7FreeTrace(master);
  FreeAlignment(aseq, &ainfo);
				/* Return */
  return;
}