Ejemplo n.º 1
0
/* Function: ReadPhylip()
 * Date:     SRE, Fri Jun 18 12:59:37 1999 [Sanger Centre]
 *
 * Purpose:  Parse an alignment from an open Phylip format
 *           alignment file. Phylip is a single-alignment format.
 *           Return the alignment, or NULL if we have no data.
 *
 *           The first nonblank line is the header; its first two
 *           whitespace-delimited fields must be integers (nseq, alen).
 *           Only nseq is used here: MSAVerifyParse() sets the
 *           alignment length from the sequences themselves.
 *           Data lines are taken round-robin, nseq lines per block.
 *           In the first block each line starts with a name field of
 *           up to 10 characters, followed by sequence; in later blocks
 *           lines hold sequence only.
 *
 * Args:     afp - open alignment file
 *
 * Returns:  MSA * - an alignment object
 *                   Caller responsible for an MSAFree()
 *           NULL if no more alignments (EOF already reached, or no
 *           header line was found)
 *
 * Diagnostics:
 *           Die()s if the header line is malformed, if nseq < 1,
 *           or if a data line carries no sequence.
 */
MSA *
ReadPhylip(MSAFILE *afp)
{
  MSA  *msa;
  char *s, *s1, *s2;
  char  name[11];		/* seq name max len = 10 char */
  int   nseq;
  int   have_header;		/* TRUE once the nseq/alen line was parsed */
  int   idx;			/* index of current sequence */
  int   slen;
  int   nblock;
  
  if (feof(afp->f)) return NULL;

  /* Skip until we see a nonblank line; it's the header,
   * containing nseq/alen
   */
  nseq        = 0;
  have_header = 0;
  while ((s = MSAFileGetLine(afp)) != NULL)
    {
      if ((s1 = sre_strtok(&s, WHITESPACE, NULL)) == NULL) continue;
      if ((s2 = sre_strtok(&s, WHITESPACE, NULL)) == NULL)
	Die("Failed to parse nseq/alen from first line of PHYLIP file %s\n", afp->fname);
      if (! IsInt(s1) || ! IsInt(s2))
	Die("nseq and/or alen not an integer in first line of PHYLIP file %s\n", afp->fname);
      nseq        = atoi(s1);
      have_header = 1;
      break;
    }

  /* Nothing but blank lines (or nothing at all): no alignment here. */
  if (! have_header) return NULL;

  /* sqname[]/aseq[] are sized from nseq below, so a non-positive count
   * would make the very first data line index past the allocation.
   */
  if (nseq < 1)
    Die("Invalid number of sequences (%d) in first line of PHYLIP file %s\n",
	nseq, afp->fname);

  msa = MSAAlloc(nseq, 0);
  idx    = 0;
  nblock = 0;
  while ((s = MSAFileGetLine(afp)) != NULL) 
    {
      /* ignore blank lines. nonblank lines start w/ nonblank char.
       * <ctype.h> functions need an unsigned char value.
       */
      if (*s == '\0' || isspace((unsigned char) *s)) continue;
				/* First block has seq names */
      if (nblock == 0) {
	strncpy(name, s, 10);	/* zero-pads if the line is shorter than 10 */
	name[10] = '\0';
	GKIStoreKey(msa->index, name);
	msa->sqname[idx] = sre_strdup(name, -1);
	/* Advance past the name field, but never past the end of a
	 * short line: strlen(name) is min(10, strlen(s)).
	 */
	s += strlen(name);
      }
				/* be careful of trailing whitespace on lines */
      if ((s1 = sre_strtok(&s, WHITESPACE, &slen)) == NULL)
	Die("Failed to parse sequence at line %d of PHYLIP file %s\n", 
	    afp->linenumber, afp->fname);
      msa->sqlen[idx] = sre_strcat(&(msa->aseq[idx]), msa->sqlen[idx], s1, slen);

      idx++;
      if (idx == nseq) { idx = 0; nblock++; }
    }
  msa->nseq = nseq;
  MSAVerifyParse(msa);		/* verifies; sets alen, wgt; frees sqlen[] */
  return msa;
}
/* Function: ReadClustal()
 * Date:     SRE, Sun Jun  6 17:53:49 1999 [bus from Madison, 1999 worm mtg]
 *
 * Purpose:  Read one alignment from an open Clustal format file.
 *           Clustal is a single-alignment format, so a second call
 *           on the same file yields NULL.
 *
 *           Lines are skipped until one starts with "CLUSTAL" and
 *           also contains "multiple sequence alignment". Every later
 *           line with at least two whitespace-delimited fields is
 *           treated as <name> <sequence>, except conservation markup
 *           lines (both fields contain one of ".*:").
 *           
 * Args:     afp  - open alignment file
 *
 * Returns:  MSA * - an alignment object
 *                   caller responsible for an MSAFree()
 *           NULL if no more alignments (at EOF, or no CLUSTAL header
 *           line was found)
 *
 * Diagnostics: 
 *           Will Die() here with a (potentially) useful message
 *           if a sequence line has a third field (usually spaces
 *           used as gap characters).
 */
MSA *
ReadClustal(MSAFILE *afp)
{
  MSA    *msa;
  char   *line;			/* current input line; consumed by sre_strtok */
  char   *name;			/* field 1: sequence name          */
  char   *seq;			/* field 2: aligned sequence chunk */
  char   *extra;		/* anything after field 2          */
  int     seqlen;		/* length of seq                   */
  int     sqidx;
  int     is_markup;

  if (feof(afp->f)) return NULL;

  /* Hunt for the CLUSTAL header; give up at EOF.
   */
  for (;;)
    {
      line = MSAFileGetLine(afp);
      if (line == NULL) return NULL;
      if (strncmp(line, "CLUSTAL", 7) != 0) continue;
      if (strstr(line, "multiple sequence alignment") != NULL) break;
    }

  msa = MSAAlloc(10, 0);

  /* Sequence section.
   */
  while ((line = MSAFileGetLine(afp)) != NULL) 
    {
      /* Need a name and a sequence field; otherwise the line is blank
       * or otherwise uninteresting.
       */
      name = sre_strtok(&line, WHITESPACE, NULL);
      if (name == NULL) continue;
      seq = sre_strtok(&line, WHITESPACE, &seqlen);
      if (seq == NULL) continue;
      extra = sre_strtok(&line, "\n", NULL);

      /* Conservation markup line: both fields carry one of .*:
       * This is tested before the extra-field check, so markup lines
       * with more than two fields are skipped silently.
       */
      is_markup = (strpbrk(name, ".*:") != NULL) && (strpbrk(seq, ".*:") != NULL);
      if (is_markup) continue;

      if (extra != NULL)
	Die("Parse failed at line %d, file %s: possibly using spaces as gaps",
	    afp->linenumber, afp->fname);
  
      /* A real sequence line: append this chunk to its sequence.
       * lastidx+1 is passed as the guess for where the name lives.
       */
      sqidx        = MSAGetSeqidx(msa, name, msa->lastidx+1);
      msa->lastidx = sqidx;
      msa->sqlen[sqidx] = sre_strcat(&(msa->aseq[sqidx]), msa->sqlen[sqidx], seq, seqlen); 
    }

  MSAVerifyParse(msa);		/* verifies, and also sets alen and wgt. */
  return msa;
}
Ejemplo n.º 3
0
Archivo: a2m.c Proyecto: rforge/phyexe
/* Function: ReadA2M()
 * Date:     SRE, Sun Jun  6 17:11:29 1999 [bus from Madison 1999 worm mtg]
 *
 * Purpose:  Read one alignment from an open A2M format file.
 *           A2M is a single alignment format.
 *
 *           A line starting with '>' opens a new sequence: the first
 *           whitespace-delimited word is its name, the rest of the
 *           line (if any) its description. On any other line, the
 *           first word is appended to the most recently opened
 *           sequence. Lines before the first '>' are ignored.
 *
 * Args:     afp - open alignment file
 *
 * Returns:  MSA *  - an alignment object. 
 *                    Caller responsible for an MSAFree()
 *           NULL if the file is already at EOF or no '>' line
 *           was found.
 */
MSA *
ReadA2M(MSAFILE *afp)
{
  MSA  *msa;
  char *line;			/* current input line; consumed by sre_strtok */
  char *name = NULL;		/* non-NULL once any '>' line was seen */
  char *desc;
  char *seq;
  int   idx  = 0;		/* index of the sequence being filled */
  int   namelen, desclen, seqlen;
  
  if (feof(afp->f)) return NULL;

  msa = MSAAlloc(10, 0);
  for (line = MSAFileGetLine(afp); line != NULL; line = MSAFileGetLine(afp))
    {
      if (*line != '>')
	{
	  /* Sequence data; only meaningful after a name line. */
	  if (name == NULL) continue;
	  seq = sre_strtok(&line, WHITESPACE, &seqlen);
	  if (seq == NULL) continue;
	  msa->sqlen[idx] = sre_strcat(&(msa->aseq[idx]), msa->sqlen[idx], seq, seqlen);
	  continue;
	}

      /* Name line: ">name [description]" */
      line++;			/* step over the '>' */
      name = sre_strtok(&line, WHITESPACE, &namelen);
      if (name == NULL)
	Die("Blank name in A2M file %s (line %d)\n", afp->fname, afp->linenumber);
      desc = sre_strtok(&line, "\n", &desclen);

      /* Register the name; its key index is the sequence's slot. */
      idx = GKIStoreKey(msa->index, name);
      if (idx >= msa->nseqalloc) MSAExpand(msa);

      msa->sqname[idx] = sre_strdup(name, namelen);
      if (desc != NULL) MSASetSeqDescription(msa, idx, desc);
      msa->nseq++;
    } 

  /* No name line at all: there was no alignment in the file. */
  if (name == NULL) 
    { 
      MSAFree(msa); 
      return NULL; 
    }

  MSAVerifyParse(msa);
  return msa;
}
Ejemplo n.º 4
0
/**
 * @brief Write alignment to file.
 *
 * @param[in] mseq
 * The mseq_t struct containing the aligned sequences
 * @param[in] pcAlnOutfile
 * The name of the output file. If NULL, the alignment is written
 * to stdout.
 * @param[in] outfmt
 * The alignment output format (defined in squid.h)
 * @param[in] iWrap
 * length of line for Clustal/Fasta format
 * @param[in] bResno
 * Only used for Clustal output: passed on to WriteClustal() as 1
 * if TRUE and 0 otherwise
 *
 * @return Non-zero on error: unknown output format, no sequences to
 * write, output file could not be opened, or writing/closing the output
 * file failed.
 *
 * @note We create a temporary squid MSA struct in here because we never
 * use it within clustal. We might be better off using the old clustal
 * output routines instead.
 *
 */
int
WriteAlignment(mseq_t *mseq, const char *pcAlnOutfile, int outfmt, int iWrap, bool bResno)
{
    int i; /* aux */
    MSA *msa; /* squid's alignment structure */
    FILE *pfOut = NULL;
    int key; /* MSA struct internal index for sequence */
    int alen; /* alignment length */
    bool use_stdout;

    assert(mseq!=NULL);

    if (MSAFILE_UNKNOWN == outfmt) {
        Log(&rLog, LOG_ERROR, "Unknown output format chosen");
        return -1;
    }

    /* The alignment length is taken from the first sequence below, so
     * there has to be one. Checked before opening the output so that
     * an existing file is not truncated for nothing.
     */
    if (mseq->nseqs < 1) {
        Log(&rLog, LOG_ERROR, "No sequences to write");
        return -1;
    }

    if (NULL == pcAlnOutfile) {
        pfOut = stdout;
        use_stdout = TRUE;
    } else {
        use_stdout = FALSE;
        if (NULL == (pfOut = fopen(pcAlnOutfile, "w"))) {
            Log(&rLog, LOG_ERROR, "Could not open file %s for writing", pcAlnOutfile);
            return -1;
        }
    }


    /* derive alignment length from first seq */
    alen = strlen(mseq->seq[0]);

    msa  = MSAAlloc(mseq->nseqs, alen);

    /* basic structure borrowed code from squid-1.9g/a2m.c:ReadA2M()
     * we actually create a copy of mseq. keeping the pointers becomes
     * messy when calling MSAFree()
     */
    for (i=0; i<mseq->nseqs; i++) {
        char *this_name = NULL; /* mseq sequence name */
        char *this_seq = NULL; /* mseq sequence */
        SQINFO *this_sqinfo = NULL; /* mseq sequence info */
        int iI;

        /* mseq->tree_order encodes to order in which sequences are listed in the guide-tree,
           if the user wants the sequence output in the input-order then mseq->tree_order==NULL,
           otherwise mseq->tree_order!=NULL, containing the indices of the sequences, FS, r274 ->  */
        iI = (NULL == mseq->tree_order) ? i : mseq->tree_order[i];

        this_name = mseq->sqinfo[iI].name;
        this_seq = mseq->seq[iI];
        this_sqinfo = &mseq->sqinfo[iI];

        key = GKIStoreKey(msa->index, this_name);
        msa->sqname[key] = sre_strdup(this_name, strlen(this_name));

        /* setting msa->sqlen[idx] and msa->aseq[idx] */
        msa->sqlen[key] = sre_strcat(&(msa->aseq[key]), msa->sqlen[key],
                                     this_seq, strlen(this_seq));

        if (this_sqinfo->flags & SQINFO_DESC) {
            /* FIXME never get here ... */
            MSASetSeqDescription(msa, key, this_sqinfo->desc);
        }
        /* FIXME extend this by copying more stuff according to flags.
         * See MSAFileRead() in msa.c and used functions there
         *
         * Problem is that we never parse MSA information as we use squid'sSeqFile
         */

        msa->nseq++;

    } /* 0 <= i < mseq->nseqs */


    /* FIXME Would like to, but can't use MSAVerifyParse(msa) here, as it
     * will die on error. Need to implement our own version
     */
#if 0
    MSAVerifyParse(msa);
#endif

    /* The below is copy of MSAFileWrite() which originally only writes to stdout.
     */

    /* Be sloppy and make a2m and fasta the same. same for vienna (which is
       the same). same same. can can. boleh boleh */
    if (outfmt==SQFILE_FASTA)
        outfmt = MSAFILE_A2M;
    if (outfmt==SQFILE_VIENNA)
        outfmt = MSAFILE_VIENNA;

    switch (outfmt) {
    case MSAFILE_A2M:
        /*WriteA2M(pfOut, msa, 0);*/
        WriteA2M(pfOut, msa, iWrap);
        break;
    case MSAFILE_VIENNA:
        /*WriteA2M(pfOut, msa, 1);*/
        WriteA2M(pfOut, msa, INT_MAX);
        break;
    case MSAFILE_CLUSTAL:
        WriteClustal(pfOut, msa, iWrap, TRUE==bResno ? 1 : 0, mseq->seqtype);
        break;
    case MSAFILE_MSF:
        WriteMSF(pfOut, msa);
        break;
    case MSAFILE_PHYLIP:
        WritePhylip(pfOut, msa);
        break;
    case MSAFILE_SELEX:
        WriteSELEX(pfOut, msa);
        break;
    case MSAFILE_STOCKHOLM:
        WriteStockholm(pfOut, msa);
        break;
    default:
        Log(&rLog, LOG_FATAL, "internal error: %s",
            "invalid output format should have been detected before");
    }

    if (use_stdout == FALSE) {
        /* The writers return nothing, so the stream's error flag and
         * the result of the final flush in fclose() are the only
         * signs of a failed write (e.g. disk full).
         */
        int write_failed = ferror(pfOut);

        if (0 != fclose(pfOut)) {
            write_failed = 1;
        }
        if (write_failed) {
            Log(&rLog, LOG_ERROR,
                "Could not write alignment to %s", pcAlnOutfile);
            MSAFree(msa);
            return -1;
        }
        Log(&rLog, LOG_INFO,
            "Alignment written to %s", pcAlnOutfile);
    }
    MSAFree(msa);

    return 0;
}
/* Function: ReadMSF()
 * Date:     SRE, Tue Jun  1 08:07:22 1999 [St. Louis]
 *
 * Purpose:  Parse an alignment read from an open MSF format
 *           alignment file. (MSF is a single-alignment format.)
 *           Return the alignment, or NULL if we've already
 *           read the alignment.
 *
 *           The file is read in four stages: an optional
 *           "!!AA_/!!NA_MULTIPLE_ALIGNMENT" header line; free-text
 *           comments up to the "MSF: .. Type: .. Check: .. .." line;
 *           the name section ("Name: .. Weight: .." lines, ended
 *           by "//"); and the sequence section. The length and
 *           checksum on the MSF: line are matched but not used.
 *           
 * Args:     afp  - open alignment file
 *
 * Returns:  MSA * - an alignment object
 *                   caller responsible for an MSAFree()
 *           NULL if no more alignments
 *
 * Diagnostics: 
 *           Will Die() here with a (potentially) useful message
 *           if a parsing error occurs.
 */
MSA *
ReadMSF(MSAFILE *afp)
{
  MSA    *msa;
  char   *s;
  int     alleged_type;
  char   *tok;
  char   *sp;
  int     slen;
  int     sqidx;
  char   *name;
  char   *seq;

  if (feof(afp->f)) return NULL;
  if ((s = MSAFileGetLine(afp)) == NULL) return NULL;

  /* The first line is the header.
   * This is a new-ish GCG feature. Don't count on it, so
   * we can be a bit more tolerant towards non-GCG software
   * generating "MSF" files.
   * If the file ends right after the header, free msa before
   * returning so it is not leaked.
   */
  msa = MSAAlloc(10, 0);
  if      (strncmp(s, "!!AA_MULTIPLE_ALIGNMENT", 23) == 0) {
    msa->type = kAmino;
    if ((s = MSAFileGetLine(afp)) == NULL) { MSAFree(msa); return NULL; }
  } else if (strncmp(s, "!!NA_MULTIPLE_ALIGNMENT", 23) == 0) {
    msa->type = kRNA;
    if ((s = MSAFileGetLine(afp)) == NULL) { MSAFree(msa); return NULL; }
  }

  /* Now we're in the free text comment section of the MSF file.
   * It ends when we see the "MSF: Type: Check: .." line.
   * This line must be present. 
   */
  do
    {
      if ((strstr(s, "..") != NULL && strstr(s, "MSF:") != NULL) &&
	  Strparse("^.+MSF: +([0-9]+) +Type: +([PNX]).+Check: +([0-9]+) +\\.\\.", s, 3))
	{
	  /* The pattern has three groups: length, type letter, checksum.
	   * This code previously read the checksum from sqd_parse[3],
	   * which only makes sense if entry 0 is the whole match and the
	   * groups are entries 1..3, so the type letter is entry 2.
	   * NOTE(review): confirm against the Strparse() documentation.
	   */
	  switch (*(sqd_parse[2])) {
	  case 'N' : alleged_type = kRNA;      break;
	  case 'P' : alleged_type = kAmino;    break;  
	  case 'X' : alleged_type = kOtherSeq; break;
	  default  : alleged_type = kOtherSeq; 
	  }
	  /* A type set by the !! header line wins over the MSF: line. */
	  if (msa->type == kOtherSeq) msa->type = alleged_type;
	  break;		/* we're done with comment section. */
	}
      if (! IsBlankline(s)) 
	MSAAddComment(msa, s);
    } while ((s = MSAFileGetLine(afp)) != NULL); 

  /* Now we're in the name section.
   * GCG has a relatively poorly documented feature: only sequences that
   * appear in this list will be read from the alignment section. Commenting
   * out sequences in the name list (by preceding them with "!") is
   * allowed as a means of manually defining subsets of sequences in
   * the alignment section. We can support this feature reasonably
   * easily because of the hash table for names in the MSA: we
   * only add names to the hash table when we see 'em in the name section.
   */
  while ((s = MSAFileGetLine(afp)) != NULL) 
    {
      while (*s == ' ' || *s == '\t') s++; /* skip leading whitespace */

      if      (*s == '\n' || *s == '\0') continue;     /* skip blank lines */
      else if (*s == '!')    MSAAddComment(msa, s);
      else if ((sp  = strstr(s, "Name:")) != NULL) 
	{
				/* We take the name and the weight, and that's it */
	  sp   += 5;
	  if ((tok = sre_strtok(&sp, " \t", &slen)) == NULL) /* <sequence name> */
	    Die("No sequence name on line %d in name section of MSF file %s\n",
		afp->linenumber, afp->fname);
	  sqidx = GKIStoreKey(msa->index, tok);
	  if (sqidx >= msa->nseqalloc) MSAExpand(msa);
	  msa->sqname[sqidx] = sre_strdup(tok, slen);
	  msa->nseq++;

	  if ((sp = strstr(sp, "Weight:")) == NULL)
	    Die("No Weight: on line %d for %s in name section of MSF file %s\n",
		afp->linenumber, msa->sqname[sqidx],  afp->fname);
	  sp += 7;
	  if ((tok = sre_strtok(&sp, " \t", &slen)) == NULL)
	    Die("No value after Weight: on line %d for %s in name section of MSF file %s\n",
		afp->linenumber, msa->sqname[sqidx],  afp->fname);
	  msa->wgt[sqidx] = atof(tok);
	  msa->flags |= MSA_SET_WGT;
	}
      else if (strncmp(s, "//", 2) == 0)
	break;
      else
	{
	  Die("Invalid line (probably %d) in name section of MSF file %s:\n%s\n",
	      afp->linenumber, afp->fname, s);
	  /* Fallback in case Die() ever returns: don't leak msa. */
	  MSAFree(msa);
	  squid_errno = SQERR_FORMAT; /* NOT THREADSAFE */
	  return NULL;
	}

    }

  /* And now we're in the sequence section. 
   * As discussed above, if we haven't seen a sequence name, then we
   * don't include the sequence in the alignment.
   * Also, watch out for coordinate-only lines.
   */
  while ((s = MSAFileGetLine(afp)) != NULL) 
    {
      sp  = s;
      if ((name = sre_strtok(&sp, " \t", NULL)) == NULL) continue;
      if ((seq  = sre_strtok(&sp, "\n",  &slen)) == NULL) continue;
      
      /* The test for a coord line: digits starting both fields.
       * <ctype.h> functions need an unsigned char value.
       */
      if (isdigit((unsigned char) *name) && isdigit((unsigned char) *seq))
	continue;
  
      /* It's not blank, and it's not a coord line: must be sequence
       */
      sqidx = GKIKeyIndex(msa->index, name);
      if (sqidx < 0) continue;	/* not a sequence we recognize */
      
      msa->sqlen[sqidx] = sre_strcat(&(msa->aseq[sqidx]), msa->sqlen[sqidx], seq, slen); 
    }
  
  /* We've left blanks in the aseqs; take them back out.
   * (seq was taken up to the newline, so it still holds the blanks
   * that separate the column groups.) Compact in place; s reads,
   * sp writes.
   */
  for (sqidx = 0; sqidx <  msa->nseq; sqidx++)
    {
      if (msa->aseq[sqidx] == NULL)
	Die("Didn't find a sequence for %s in MSF file %s\n", msa->sqname[sqidx], afp->fname);
      
      for (s = sp = msa->aseq[sqidx]; *s != '\0'; s++)
	{
	  if (*s == ' ' || *s == '\t') {
	    msa->sqlen[sqidx]--;
	  } else {
	    *sp = *s;
	    sp++;
	  }
	}
      *sp = '\0';
    }
  
  MSAVerifyParse(msa);		/* verifies, and also sets alen and wgt. */
  return msa;
}
Ejemplo n.º 6
0
/* Function: ReadSELEX()
 * Date:     SRE, Sun Jun  6 18:24:09 1999 [St. Louis]
 *
 * Purpose:  Parse an alignment read from an open SELEX format
 *           alignment file. (SELEX is a single alignment format).
 *           Return the alignment, or NULL if we've already read the
 *           alignment or there's no alignment data in the file.
 *           
 * Limitations: SELEX is the only remaining multipass parser for
 *           alignment files. It cannot read from gzip or from stdin.
 *           It Die()'s here if you try. The reason for this is
 *           that SELEX allows space characters as gaps, so we don't
 *           know the borders of an alignment block until we've seen
 *           the whole block. I could rewrite to allow single-pass
 *           parsing (by storing the whole block in memory) but
 *           since SELEX is now legacy, why bother.
 *           
 *           Note that the interface is totally kludged: fastest
 *           possible adaptation of old ReadSELEX() to the new
 *           MSA interface.  
 *
 * Method:   Pass 1 counts the sequences, saves their names from the
 *           first block, and records the leftmost and rightmost
 *           data columns of every block. The file is then rewound.
 *           Pass 2 parses the "#=" header lines and copies each
 *           block's columns into the aligned sequences and markup
 *           strings. The result is converted with MSAFromAINFO().
 *
 * Args:     afp  - open alignment file
 *
 * Returns:  MSA *  - an alignment object
 *                    caller responsible for an MSAFree()
 *           NULL if no alignment data.          
 *
 * Diagnostics:
 *           Die()s on gzip/stdin input, on blocks whose number of
 *           sequences differs from the first block, on malformed
 *           "#=" header lines, on more #=SQ lines than sequences,
 *           and on #=SS/#=SA lines that precede any sequence of
 *           their block.
 */
MSA *
ReadSELEX(MSAFILE *afp)
{
  MSA     *msa;                 /* RETURN: mult seq alignment   */
  FILE    *fp;                  /* ptr to opened seqfile        */
  char   **aseqs;               /* aligned seqs                 */
  int      num = 0;		/* number of seqs read          */
  char     buffer[LINEBUFLEN];	/* input buffer for lines       */
  char     bufcpy[LINEBUFLEN];	/* strtok'able copy of buffer   */
  struct block_struc {          /** alignment data for a block: */
    int lcol;			/* furthest left aligned sym    */
    int rcol;			/* furthest right aligned sym   */
  } *blocks = NULL;
  int      blocknum;		/* number of blocks in file     */
  char    *nptr;                /* ptr to start of name on line */
  char    *sptr;                /* ptr into sequence on line    */
  int      currnum;		/* num. seqs in given block     */
  int      currblock;		/* index for blocks             */
  int      i;			/* loop counter                 */
  int      seqidx;		/* counter for seqs             */
  int      alen;                /* length of alignment          */
  int      warn_names;          /* becomes TRUE if names don't match between blocks */
  int      headnum;		/* seqidx in per-sequence header info */
  int      currlen;		/* columns filled by the blocks done so far */
  int      count;
  int      have_cs = 0;
  int      have_rf = 0;
  AINFO    base_ainfo, *ainfo;	/* hack: used to be passed ptr to AINFO */


  /* Convert from MSA interface to what old ReadSELEX() did:
   *     - copy our open fp, rather than opening file
   *     - verify that we're not reading a gzip or stdin
   */
  if (feof(afp->f)) return NULL;
  if (afp->do_gzip || afp->do_stdin)
    Die("Can't read a SELEX format alignment from a pipe, stdin, or gzip'ed file"); 
  fp    = afp->f;
  ainfo = &base_ainfo;

  /***************************************************
   * First pass across file. 
   * Count seqs, get names, determine column info
   * Determine what sorts of info are active in this file.
   ***************************************************/

  InitAinfo(ainfo);
				/* get first line of the block 
				 * (non-comment, non-blank) */
  do
    {
      if (fgets(buffer, LINEBUFLEN, fp) == NULL)
	{ squid_errno = SQERR_NODATA; return NULL; }
      strcpy(bufcpy, buffer);
      if (*buffer == '#')
	{
	  if      (strncmp(buffer, "#=CS",    4) == 0) have_cs = 1;
	  else if (strncmp(buffer, "#=RF",    4) == 0) have_rf = 1;
	}
    }
  while ((nptr = strtok(bufcpy, WHITESPACE)) == NULL || 
	 (strchr(commentsyms, *nptr) != NULL));

  blocknum   = 0;
  warn_names = FALSE;
  while (!feof(fp))
    {
				/* allocate for info about this block. */
      if (blocknum == 0)
	blocks = (struct block_struc *) MallocOrDie (sizeof(struct block_struc));
      else 
	blocks = (struct block_struc *) ReallocOrDie (blocks, (blocknum+1) * sizeof(struct block_struc));
      blocks[blocknum].lcol = LINEBUFLEN+1;
      blocks[blocknum].rcol = -1;
	
      currnum = 0;
      while (nptr != NULL)	/* becomes NULL when this block ends. */
      {
				/* First block only: save names */
	if (blocknum == 0)
	  {
	    if (currnum == 0)
	      ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO));
	    else 
	      ainfo->sqinfo = (SQINFO *) ReallocOrDie (ainfo->sqinfo, (currnum + 1) * sizeof(SQINFO));

	    ainfo->sqinfo[currnum].flags = 0;
	    SetSeqinfoString(&(ainfo->sqinfo[currnum]), nptr, SQINFO_NAME);
	  }
	else			/* in each additional block: check names */
	  {
	    /* sqinfo[] only has num entries (sized by the first block);
	     * a longer block must be rejected before it is indexed.
	     */
	    if (currnum >= num)
	      Die("Parse error in ReadSELEX()");
	    if (strcmp(ainfo->sqinfo[currnum].name, nptr) != 0)
	      warn_names = TRUE;
	  }
	currnum++;

				/* check rcol, lcol */
	if ((sptr = strtok(NULL, WHITESPACE)) != NULL)
	  {
				/* is this the furthest left we've
				   seen word 2 in this block? */
	    if (sptr - bufcpy < blocks[blocknum].lcol) 
	      blocks[blocknum].lcol = sptr - bufcpy;
				/* look for right side in buffer */
	    for (sptr = buffer + strlen(buffer) - 1;  
		 strchr(WHITESPACE, *sptr) != NULL;
		 sptr --)
	      /* do nothing */ ;
	    if (sptr - buffer > blocks[blocknum].rcol)
	      blocks[blocknum].rcol = sptr - buffer;
	  }

				/* get the next line; blank line means end of block */
	do
	  {
	    if (fgets(buffer, LINEBUFLEN, fp) == NULL) 
	      { nptr = NULL; break; }
	    strcpy(bufcpy, buffer);

				/* #=SS, #=SA annotate the sequence just read */
	    if      (strncmp(buffer, "#=SS",    4) == 0) ainfo->sqinfo[currnum-1].flags |= SQINFO_SS;
	    else if (strncmp(buffer, "#=SA",    4) == 0) ainfo->sqinfo[currnum-1].flags |= SQINFO_SA;
	    else if (strncmp(buffer, "#=CS",    4) == 0) have_cs = 1;
	    else if (strncmp(buffer, "#=RF",    4) == 0) have_rf = 1;

	    if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) 
	      break;
	  } while (strchr(commentsyms, *nptr) != NULL);
      }


				/* check that number of sequences matches expected */
      if (blocknum == 0)
	num = currnum;
      else if (currnum != num)
	Die("Parse error in ReadSELEX()");
      blocknum++;

				/* get first line of next block 
				 * (non-comment, non-blank) */
      do
	{
	  if (fgets(buffer, LINEBUFLEN, fp) == NULL) { nptr = NULL; break; }
	  strcpy(bufcpy, buffer);
	}
      while ((nptr = strtok(bufcpy, WHITESPACE)) == NULL || 
	     (strchr(commentsyms, *nptr) != NULL));
    }

  
  /***************************************************
   * Get ready for second pass:
   *   figure out the length of the alignment
   *   malloc space
   *   rewind the file
   ***************************************************/

  alen = 0;
  for (currblock = 0; currblock < blocknum; currblock++)
    alen += blocks[currblock].rcol - blocks[currblock].lcol + 1;

  rewind(fp);

  /* allocations. we can't use AllocateAlignment because of
   * the way we already used ainfo->sqinfo.
   */
  aseqs     = (char **) MallocOrDie (num * sizeof(char *));
  if (have_cs) 
    ainfo->cs = (char *) MallocOrDie ((alen+1) * sizeof(char));
  if (have_rf) 
    ainfo->rf = (char *) MallocOrDie ((alen+1) * sizeof(char));

  
  
  for (i = 0; i < num; i++)
    {
      aseqs[i]     = (char *) MallocOrDie ((alen+1) * sizeof(char));
      if (ainfo->sqinfo[i].flags & SQINFO_SS)
	ainfo->sqinfo[i].ss = (char *) MallocOrDie ((alen+1) * sizeof(char));
      if (ainfo->sqinfo[i].flags & SQINFO_SA)
	ainfo->sqinfo[i].sa = (char *) MallocOrDie ((alen+1) * sizeof(char));
    }
  
  ainfo->alen = alen;
  ainfo->nseq = num; 
  ainfo->wgt  = (float *) MallocOrDie (sizeof(float) * num);
  FSet(ainfo->wgt, num, 1.0);

  /***************************************************
   * Second pass across file. Parse header; assemble sequences
   ***************************************************/
  /* We've now made a complete first pass over the file. We know how
   * many blocks it contains, we know the number of seqs in the first
   * block, and we know every block has the same number of sequences;
   * so we can be a bit more cavalier about error-checking as we
   * make the second pass.
   */

  /* Look for header
   */
  headnum = 0;
  for (;;)
    {
      if (fgets(buffer, LINEBUFLEN, fp) == NULL)
	Die("Parse error in ReadSELEX()");
      strcpy(bufcpy, buffer);
      if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) continue; /* skip blank lines */

      if (strcmp(nptr, "#=AU") == 0  && (sptr = strtok(NULL, "\n")) != NULL)
	ainfo->au = Strdup(sptr);
      else if (strcmp(nptr, "#=ID") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
	ainfo->name = Strdup(sptr);
      else if (strcmp(nptr, "#=AC") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
	ainfo->acc  = Strdup(sptr);
      else if (strcmp(nptr, "#=DE") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
	ainfo->desc = Strdup(sptr);
      else if (strcmp(nptr, "#=GA") == 0)
	{
	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 
	    Die("Parse error in #=GA line in ReadSELEX()");
	  ainfo->ga1 = atof(sptr);

	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 
	    Die("Parse error in #=GA line in ReadSELEX()");
	  ainfo->ga2 = atof(sptr);

	  ainfo->flags |= AINFO_GA;
	}
      else if (strcmp(nptr, "#=TC") == 0)
	{
	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 
	    Die("Parse error in #=TC line in ReadSELEX()");
	  ainfo->tc1 = atof(sptr);

	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 
	    Die("Parse error in #=TC line in ReadSELEX()");
	  ainfo->tc2 = atof(sptr);

	  ainfo->flags |= AINFO_TC;
	}
      else if (strcmp(nptr, "#=NC") == 0)
	{
	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 
	    Die("Parse error in #=NC line in ReadSELEX()");
	  ainfo->nc1 = atof(sptr);

	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 
	    Die("Parse error in #=NC line in ReadSELEX()");
	  ainfo->nc2 = atof(sptr);

	  ainfo->flags |= AINFO_NC;
	}
      else if (strcmp(nptr, "#=SQ") == 0)      /* per-sequence header info */
	{
				/* sqinfo[] and wgt[] hold num entries only */
	  if (headnum >= num)
	    Die("Parse error in #=SQ line in ReadSELEX(): more #=SQ lines than sequences");

				/* first field is the name */
	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
	    Die("Parse error in #=SQ line in ReadSELEX()");
	  if (strcmp(sptr, ainfo->sqinfo[headnum].name) != 0) warn_names = TRUE;

				/* second field is the weight */
	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
	    Die("Parse error in #=SQ line in ReadSELEX()");
	  if (!IsReal(sptr)) 
	    Die("Parse error in #=SQ line in ReadSELEX(): weight is not a number");
	  ainfo->wgt[headnum] = atof(sptr);

				/* third field is database source id */
	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
	    Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
	  SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_ID);

				/* fourth field is database accession number */
	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
	    Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
	  SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_ACC);

				/* fifth field is start..stop::olen */
	  if ((sptr = strtok(NULL, ".:")) == NULL)
	    Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
	  SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_START);

	  if ((sptr = strtok(NULL, ".:")) == NULL)
	    Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
	  SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_STOP);
	  
	  if ((sptr = strtok(NULL, ":\t ")) == NULL)
	    Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
	  SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_OLEN);

				/* rest of line is optional description */
	  if ((sptr = strtok(NULL, "\n")) != NULL)
	    SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_DESC);
	  
	  headnum++;
	}
      else if (strcmp(nptr, "#=CS") == 0) break;
      else if (strcmp(nptr, "#=RF") == 0) break;
      else if (strchr(commentsyms, *nptr) == NULL) break; /* non-comment, non-header */
    }
  

  /* On leaving the header loop, buffer/nptr hold the first line of
   * the first block.
   */
  currlen = 0;
  for (currblock = 0 ; currblock < blocknum; currblock++)
    {
				/* parse the block */
      seqidx = 0;
      while (nptr != NULL)
	{
				/* Consensus structure */
	  if (strcmp(nptr, "#=CS") == 0)
	    {
	      if (! copy_alignment_line(ainfo->cs, currlen, strlen(nptr)-1, 
					buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
		Die("Parse error in #=CS line in ReadSELEX()");
	    }

				/* Reference coordinates */
	  else if (strcmp(nptr, "#=RF") == 0)
	    {
	      if (! copy_alignment_line(ainfo->rf, currlen, strlen(nptr)-1, 
					buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
		Die("Parse error in #=RF line in ReadSELEX()");
	    }
				/* Individual secondary structure */
	  else if (strcmp(nptr, "#=SS") == 0)
	    {
				/* annotates the previous sequence; there must be one */
	      if (seqidx == 0)
		Die("Parse error in #=SS line in ReadSELEX()");
	      if (! copy_alignment_line(ainfo->sqinfo[seqidx-1].ss, currlen, strlen(nptr)-1,
					buffer, blocks[currblock].lcol, 
					blocks[currblock].rcol, (char) '.'))
		Die("Parse error in #=SS line in ReadSELEX()");
	    }

				/* Side chain % surface accessibility code */
	  else if (strcmp(nptr, "#=SA") == 0)
	    {
				/* annotates the previous sequence; there must be one */
	      if (seqidx == 0)
		Die("Parse error in #=SA line in ReadSELEX()");
	      if (! copy_alignment_line(ainfo->sqinfo[seqidx-1].sa, currlen, strlen(nptr)-1,
					buffer, blocks[currblock].lcol, 
					blocks[currblock].rcol, (char) '.'))
		Die("Parse error in #=SA line in ReadSELEX()");
	    }
				/* Aligned sequence; avoid unparsed machine comments */
	  else if (strncmp(nptr, "#=", 2) != 0)
	    {
				/* aseqs[] holds num sequences only */
	      if (seqidx >= num)
		Die("Parse error in alignment line in ReadSELEX()");
	      if (! copy_alignment_line(aseqs[seqidx], currlen, strlen(nptr)-1, 
					buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
		Die("Parse error in alignment line in ReadSELEX()");
	      seqidx++;
	    }

				/* get next line */
	  for (;;)
	    {
	      nptr = NULL;
	      if (fgets(buffer, LINEBUFLEN, fp) == NULL) break;	/* EOF */
	      strcpy(bufcpy, buffer);
	      if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) break; /* blank */
	      if (strncmp(buffer, "#=", 2) == 0) break;      /* machine comment */
	      if (strchr(commentsyms, *nptr) == NULL) break; /* data */
	    }
	} /* end of a block */

      currlen += blocks[currblock].rcol - blocks[currblock].lcol + 1;

				/* get line 1 of next block */
      for (;;)
	{
	  if (fgets(buffer, LINEBUFLEN, fp) == NULL) break; /* no data */
	  strcpy(bufcpy, buffer);
	  if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) continue; /* blank */
	  if (strncmp(buffer, "#=", 2) == 0)       break; /* machine comment */
	  if (strchr(commentsyms, *nptr) == NULL) break; /* non-comment */
	}
    } /* end of the file */

  /* Lengths in sqinfo are for raw sequence (ungapped),
   * and SS, SA are 0..rlen-1 not 0..alen-1.
   * Only the seqs with structures come out of here with lengths set.
   * The strings are compacted in place: rpos (write) never runs
   * ahead of apos (read).
   */
  for (seqidx = 0; seqidx < num; seqidx++)
    {
      int apos, rpos;
				/* secondary structures */
      if (ainfo->sqinfo[seqidx].flags & SQINFO_SS)
	{
	  for (apos = rpos = 0; apos < alen; apos++)
	    if (! isgap(aseqs[seqidx][apos]))
	      {
		ainfo->sqinfo[seqidx].ss[rpos] = ainfo->sqinfo[seqidx].ss[apos];
		rpos++;
	      }
	  ainfo->sqinfo[seqidx].ss[rpos] = '\0';
	}
				/* Surface accessibility */
      if (ainfo->sqinfo[seqidx].flags & SQINFO_SA)
	{
	  for (apos = rpos = 0; apos < alen; apos++)
	    if (! isgap(aseqs[seqidx][apos]))
	      {
		ainfo->sqinfo[seqidx].sa[rpos] = ainfo->sqinfo[seqidx].sa[apos];
		rpos++;
	      }
	  ainfo->sqinfo[seqidx].sa[rpos] = '\0';
	}
    }

				/* NULL-terminate all the strings */
  if (ainfo->rf != NULL) ainfo->rf[alen] = '\0';
  if (ainfo->cs != NULL) ainfo->cs[alen] = '\0';
  for (seqidx = 0; seqidx < num; seqidx++)
    aseqs[seqidx][alen]            = '\0';
  
				/* find raw sequence lengths for sqinfo */
  for (seqidx = 0; seqidx < num; seqidx++)
    {
      count = 0;
      for (sptr = aseqs[seqidx]; *sptr != '\0'; sptr++)
	if (!isgap(*sptr)) count++;
      ainfo->sqinfo[seqidx].len    = count;
      ainfo->sqinfo[seqidx].flags |= SQINFO_LEN;
    }


  /***************************************************
   * Garbage collection and return
   ***************************************************/
  free(blocks);
  if (warn_names) 
    Warn("sequences may be in different orders in blocks of %s?", afp->fname);

  /* Convert back to MSA structure. (Wasteful kludge.)
   */
  msa = MSAFromAINFO(aseqs, ainfo);
  MSAVerifyParse(msa);
  FreeAlignment(aseqs, ainfo);
  return msa;
}