Ejemplo n.º 1
0
static void 
readGenBank(struct ReadSeqVars *V)
{
  char *sptr;
  int   in_definition;

  while (strncmp(V->sbuffer, "LOCUS", 5) != 0)
    getline2(V);

  if ((sptr = strtok(V->sbuffer+12, "\n\t ")) != NULL)
    {
      SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
      SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID);
    }

  in_definition = FALSE;
  while (! feof(V->f))
    {
      getline2(V);
      if (! feof(V->f) && strstr(V->sbuffer, "DEFINITION") == V->sbuffer)
	{
	  if ((sptr = strtok(V->sbuffer+12, "\n")) != NULL)
	    SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC);
	  in_definition = TRUE;
	}
      else if (! feof(V->f) && strstr(V->sbuffer, "ACCESSION") == V->sbuffer)
	{
	  if ((sptr = strtok(V->sbuffer+12, "\n\t ")) != NULL)
	    SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC);
	  in_definition = FALSE;
	}
      else if (strncmp(V->sbuffer,"ORIGIN", 6) != 0)
	{
	  if (in_definition)
	    SetSeqinfoString(V->sqinfo, V->sbuffer, SQINFO_DESC);
	}
      else
	break;
    }

  readLoop(0, endGB, V);

  /* reading a real GenBank database file, we keep the source coords
   */
  V->sqinfo->start = 1;
  V->sqinfo->stop  = V->seqlen;
  V->sqinfo->olen  = V->seqlen;
  V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN;


  while (!(feof(V->f) || ((*V->sbuffer!=0) && (strstr(V->sbuffer,"LOCUS") == V->sbuffer))))
    getline2(V);
				/* SRE: V->s now holds "//", so sequential
				   reads are wedged: fixed Tue Jul 13 1993 */
  while (!feof(V->f) && strstr(V->sbuffer, "LOCUS  ") != V->sbuffer)
    getline2(V);
}
Ejemplo n.º 2
0
static void 
readPearson(struct ReadSeqVars *V)
{
  char *sptr;

  if ((sptr = strtok(V->sbuffer+1, "\n\t ")) != NULL)
    SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
  if ((sptr = strtok(NULL, "\n")) != NULL)
    SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC);
				/* workaround for long NCBI NR lines */
  while (V->longline && ! feof(V->f)) getline2(V);

  readLoop(0, endPearson, V);

  while (!(feof(V->f) || ((*V->sbuffer != 0) && (*V->sbuffer == '>'))))
    getline2(V);
}
Ejemplo n.º 3
0
static void
readZuker(struct ReadSeqVars *V)
{
  char *sptr;

  getline2(V);  /*s == "seqLen seqid string..."*/

  if ((sptr = strtok(V->sbuffer+6, " \t\n")) != NULL)
    SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);

  if ((sptr = strtok(NULL, "\n")) != NULL)
    SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC);

  readLoop(0, endZuker, V);

  while (!(feof(V->f) | ((*V->sbuffer != '\0') & (*V->sbuffer == '('))))
    getline2(V);
}
Ejemplo n.º 4
0
static void 
readEMBL(struct ReadSeqVars *V)
{
  char *sptr;

				/* make sure we have first line */
  while (!feof(V->f) && strncmp(V->sbuffer, "ID  ", 4) != 0)
    getline2(V);

  if ((sptr = strtok(V->sbuffer+5, "\n\t ")) != NULL)
    {
      SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
      SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID);
    }

  do {
    getline2(V);
    if (!feof(V->f) && strstr(V->sbuffer, "AC  ") == V->sbuffer)
      {
	if ((sptr = strtok(V->sbuffer+5, ";  \t\n")) != NULL)
	  SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC);
      }
    else if (!feof(V->f) && strstr(V->sbuffer, "DE  ") == V->sbuffer)
      {
	if ((sptr = strtok(V->sbuffer+5, "\n")) != NULL)
	  SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC);
      }
  } while (! feof(V->f) && strncmp(V->sbuffer,"SQ",2) != 0);
  
  readLoop(0, endEMBL, V);

  /* reading a real EMBL database file, we keep the source coords
   */
  V->sqinfo->start = 1;
  V->sqinfo->stop  = V->seqlen;
  V->sqinfo->olen  = V->seqlen;
  V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN;

				/* load next record's ID line */
  while (!feof(V->f) && strncmp(V->sbuffer, "ID  ", 4) != 0)
    getline2(V);
}
Ejemplo n.º 5
0
static void
readGCGdata(struct ReadSeqVars *V)
{
  int   binary = FALSE;		/* whether data are binary or not */
  int   blen;			/* length of binary sequence */
  
				/* first line contains ">>>>" followed by name */
  if (Strparse(">>>>([^ ]+) .+2BIT +Len: ([0-9]+)", V->sbuffer, 2) == 0) 
    {
      binary = TRUE;
      SetSeqinfoString(V->sqinfo, sqd_parse[1], SQINFO_NAME);
      blen = atoi(sqd_parse[2]);
    } 
  else if (Strparse(">>>>([^ ]+) .+ASCII +Len: [0-9]+", V->sbuffer, 1) == 0) 
    SetSeqinfoString(V->sqinfo, sqd_parse[1], SQINFO_NAME);
  else 
    Die("bogus GCGdata format? %s", V->sbuffer);

				/* second line contains free text description */
  getline2(V);
  SetSeqinfoString(V->sqinfo, V->sbuffer, SQINFO_DESC);

  if (binary) {
    /* allocate for blen characters +3... (allow for 3 bytes of slop) */
    if (blen >= V->maxseq) {
      V->maxseq = blen;
      if ((V->seq = (char *) realloc (V->seq, sizeof(char)*(V->maxseq+4)))==NULL)
	Die("malloc failed");
    }
				/* read (blen+3)/4 bytes from file */
    if (fread(V->seq, sizeof(char), (blen+3)/4, V->f) < (size_t) ((blen+3)/4))
      Die("fread failed");
    V->seqlen = blen;
				/* convert binary code to seq */
    GCGBinaryToSequence(V->seq, blen);
  }
  else readLoop(0, endGCGdata, V);
  
  while (!(feof(V->f) || ((*V->sbuffer != 0) && (*V->sbuffer == '>'))))
    getline2(V);
}
Ejemplo n.º 6
0
static void
readPIR(struct ReadSeqVars *V)
{
  char *sptr;
				/* load first line of entry  */
  while (!feof(V->f) && strncmp(V->sbuffer, "ENTRY", 5) != 0)
    getline2(V);
  if (feof(V->f)) return;

  if ((sptr = strtok(V->sbuffer + 15, "\n\t ")) != NULL)
    {
      SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
      SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID);
    }
  do {
    getline2(V);
    if (!feof(V->f) && strncmp(V->sbuffer, "TITLE", 5) == 0)
      SetSeqinfoString(V->sqinfo, V->sbuffer+15, SQINFO_DESC);
    else if (!feof(V->f) && strncmp(V->sbuffer, "ACCESSION", 9) == 0)
      {
	if ((sptr = strtok(V->sbuffer+15, " \t\n")) != NULL)
	  SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC);
      }
  } while (! feof(V->f) && (strncmp(V->sbuffer,"SEQUENCE", 8) != 0));
  getline2(V);			/* skip next line, coords */

  readLoop(0, endPIR, V);

  /* reading a real PIR-CODATA database file, we keep the source coords
   */
  V->sqinfo->start = 1;
  V->sqinfo->stop  = V->seqlen;
  V->sqinfo->olen  = V->seqlen;
  V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN;

  /* get next line
   */
  while (!feof(V->f) && strncmp(V->sbuffer, "ENTRY", 5) != 0)
    getline2(V);
}
Ejemplo n.º 7
0
static int 
parse_MSF(FILE *fp, AINFO *ainfo)
{
  char buffer[LINEBUFLEN];
  char *sptr;
  int  nseq;
  
  /* Get first dividing line. MSF format specifies ints after MSF: and Check:
   * but we don't make sure of this 
   */
  do {
    if (fgets(buffer, LINEBUFLEN, fp) == NULL) { squid_errno = SQERR_FORMAT; return 0; }
  } while (strstr(buffer, " MSF: ") == NULL ||
	   strstr(buffer, " Check: ") == NULL ||
	   strstr(buffer, " ..") == NULL);

  /* Get names, weights from header
   */
  nseq = 0;
  /*CONSTCOND*/
  while (1) 
    {
      if (fgets(buffer, LINEBUFLEN, fp) == NULL) { squid_errno = SQERR_FORMAT; return 0; }
      if (is_blankline(buffer)) continue; 
      if (strncmp(buffer, "//", 2) == 0) break;

      sptr = strtok(buffer, WHITESPACE);
      if (sptr == NULL || strcmp(sptr, "Name:") != 0 || strstr(sptr+5, "Weight:") != 0)
	{ squid_errno = SQERR_FORMAT; return 0; }
      
      if ((sptr = strtok(NULL, WHITESPACE)) == NULL) {squid_errno=SQERR_FORMAT; return 0; }
      SetSeqinfoString(&(ainfo->sqinfo[nseq]), sptr, SQINFO_NAME);

      while (sptr != NULL && strcmp(sptr, "Weight:") != 0) sptr = strtok(NULL, WHITESPACE);
      if ((sptr = strtok(NULL, WHITESPACE)) == NULL) {squid_errno=SQERR_FORMAT; return 0; }
      if (! IsReal(sptr)) { squid_errno = SQERR_FORMAT; return 0; }
      ainfo->wgt[nseq] = atof(sptr);

      nseq++;
    }

  if (nseq != ainfo->nseq) { squid_errno = SQERR_FORMAT; return 0; }
  return 1;
}
Ejemplo n.º 8
0
static void 
readIG(struct ReadSeqVars *V)
{
  char *nm;
				/* position past ';' comments */
  do {
    getline2(V);
  } while (! (feof(V->f) || ((*V->sbuffer != 0) && (*V->sbuffer != ';')) ));

  if (!feof(V->f))
    {
      if ((nm = strtok(V->sbuffer, "\n\t ")) != NULL)
	SetSeqinfoString(V->sqinfo, nm, SQINFO_NAME);

      readLoop(0, endIG, V);
    }
  
  while (!(feof(V->f) || ((*V->sbuffer != '\0') && (*V->sbuffer == ';'))))
    getline2(V);
}
Ejemplo n.º 9
0
static void 
readUWGCG(struct ReadSeqVars *V)
{
  char  *si;
  char  *sptr;
  int    done;

  V->seqlen = 0;

  /*writeseq: "    %s  Length: %d  (today)  Check: %d  ..\n" */
  /*drop above or ".." from id*/
  if ((si = strstr(V->sbuffer,"  Length: ")) != NULL) *si = 0;
  else if ((si = strstr(V->sbuffer,"..")) != NULL)    *si = 0;

  if ((sptr = strtok(V->sbuffer, "\n\t ")) != NULL)
    SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);

  do {
    done = feof(V->f);
    getline2(V);
    if (! done) addseq(V->sbuffer, V);
  } while (!done);
}
Ejemplo n.º 10
0
static void 
readStrider(struct ReadSeqVars *V)
{ 
  char *nm;
  
  while ((!feof(V->f)) && (*V->sbuffer == ';')) 
    {
      if (strncmp(V->sbuffer,"; DNA sequence", 14) == 0)
	{
	  if ((nm = strtok(V->sbuffer+16, ",\n\t ")) != NULL)
	    SetSeqinfoString(V->sqinfo, nm, SQINFO_NAME);
	}
      getline2(V);
    }

  if (! feof(V->f))
    readLoop(1, endStrider, V);

  /* load next line
   */
  while ((!feof(V->f)) && (*V->sbuffer != ';')) 
    getline2(V);
}
Ejemplo n.º 11
0
/* Function: ReadSELEX()
 * Date:     SRE, Sun Jun  6 18:24:09 1999 [St. Louis]
 *
 * Purpose:  Parse an alignment read from an open SELEX format
 *           alignment file. (SELEX is a single alignment format).
 *           Return the alignment, or NULL if we've already read the
 *           alignment or there's no alignment data in the file.
 *           
 * Limitations: SELEX is the only remaining multipass parser for
 *           alignment files. It cannot read from gzip or from stdin.
 *           It Die()'s here if you try. The reason for this
 *           that SELEX allows space characters as gaps, so we don't
 *           know the borders of an alignment block until we've seen
 *           the whole block. I could rewrite to allow single-pass
 *           parsing (by storing the whole block in memory) but
 *           since SELEX is now legacy, why bother.
 *           
 *           Note that the interface is totally kludged: fastest
 *           possible adaptation of old ReadSELEX() to the new
 *           MSA interface.  
 *
 * Args:     afp  - open alignment file
 *
 * Returns:  MSA *  - an alignment object
 *                    caller responsible for an MSAFree()
 *           NULL if no alignment data.          
 */
MSA *
ReadSELEX(MSAFILE *afp)
{
  MSA     *msa;                 /* RETURN: mult seq alignment   */
  FILE    *fp;                  /* ptr to opened seqfile        */
  char   **aseqs;               /* aligned seqs                 */
  int      num = 0;		/* number of seqs read          */
  char     buffer[LINEBUFLEN];	/* input buffer for lines       */
  char     bufcpy[LINEBUFLEN];	/* strtok'able copy of buffer   */
  struct block_struc {          /** alignment data for a block: */
    int lcol;			/* furthest left aligned sym    */
    int rcol;			/* furthest right aligned sym   */
  } *blocks = NULL;
  int      blocknum;		/* number of blocks in file     */
  char    *nptr;                /* ptr to start of name on line */
  char    *sptr;                /* ptr into sequence on line    */
  int      currnum;		/* num. seqs in given block     */
  int      currblock;		/* index for blocks             */
  int      i;			/* loop counter                 */
  int      seqidx;		/* counter for seqs             */
  int      alen;                /* length of alignment          */
  int      warn_names;          /* becomes TRUE if names don't match between blocks */
  int      headnum;		/* seqidx in per-sequence header info */
  int      currlen;
  int      count;
  int      have_cs = 0;
  int      have_rf = 0;
  AINFO    base_ainfo, *ainfo;	/* hack: used to be passed ptr to AINFO */


  /* Convert from MSA interface to what old ReadSELEX() did:
   *     - copy our open fp, rather than opening file
   *     - verify that we're not reading a gzip or stdin
   */
  if (feof(afp->f)) return NULL;
  if (afp->do_gzip || afp->do_stdin)
    Die("Can't read a SELEX format alignment from a pipe, stdin, or gzip'ed file"); 
  fp    = afp->f;
  ainfo = &base_ainfo;

  /***************************************************
   * First pass across file. 
   * Count seqs, get names, determine column info
   * Determine what sorts of info are active in this file.
   ***************************************************/

  InitAinfo(ainfo);
				/* get first line of the block 
				 * (non-comment, non-blank) */
  do
    {
      if (fgets(buffer, LINEBUFLEN, fp) == NULL)
	{ squid_errno = SQERR_NODATA; return 0; }
      strcpy(bufcpy, buffer);
      if (*buffer == '#')
	{
	  if      (strncmp(buffer, "#=CS",    4) == 0) have_cs = 1;
	  else if (strncmp(buffer, "#=RF",    4) == 0) have_rf = 1;
	}
    }
  while ((nptr = strtok(bufcpy, WHITESPACE)) == NULL || 
	 (strchr(commentsyms, *nptr) != NULL));

  blocknum   = 0;
  warn_names = FALSE;
  while (!feof(fp))
    {
				/* allocate for info about this block. */
      if (blocknum == 0)
	blocks = (struct block_struc *) MallocOrDie (sizeof(struct block_struc));
      else 
	blocks = (struct block_struc *) ReallocOrDie (blocks, (blocknum+1) * sizeof(struct block_struc));
      blocks[blocknum].lcol = LINEBUFLEN+1;
      blocks[blocknum].rcol = -1;
	
      currnum = 0;
      while (nptr != NULL)	/* becomes NULL when this block ends. */
      {
				/* First block only: save names */
	if (blocknum == 0)
	  {
	    if (currnum == 0)
	      ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO));
	    else 
	      ainfo->sqinfo = (SQINFO *) ReallocOrDie (ainfo->sqinfo, (currnum + 1) * sizeof(SQINFO));

	    ainfo->sqinfo[currnum].flags = 0;
	    SetSeqinfoString(&(ainfo->sqinfo[currnum]), nptr, SQINFO_NAME);
	  }
	else			/* in each additional block: check names */
	  {
	    if (strcmp(ainfo->sqinfo[currnum].name, nptr) != 0)
	      warn_names = TRUE;
	  }
	currnum++;

				/* check rcol, lcol */
	if ((sptr = strtok(NULL, WHITESPACE)) != NULL)
	  {
				/* is this the furthest left we've
				   seen word 2 in this block? */
	    if (sptr - bufcpy < blocks[blocknum].lcol) 
	      blocks[blocknum].lcol = sptr - bufcpy;
				/* look for right side in buffer */
	    for (sptr = buffer + strlen(buffer) - 1;  
		 strchr(WHITESPACE, *sptr) != NULL;
		 sptr --)
	      /* do nothing */ ;
	    if (sptr - buffer > blocks[blocknum].rcol)
	      blocks[blocknum].rcol = sptr - buffer;
	  }

				/* get the next line; blank line means end of block */
	do
	  {
	    if (fgets(buffer, LINEBUFLEN, fp) == NULL) 
	      { nptr = NULL; break; }
	    strcpy(bufcpy, buffer);

	    if      (strncmp(buffer, "#=SS",    4) == 0) ainfo->sqinfo[currnum-1].flags |= SQINFO_SS;
	    else if (strncmp(buffer, "#=SA",    4) == 0) ainfo->sqinfo[currnum-1].flags |= SQINFO_SA;
	    else if (strncmp(buffer, "#=CS",    4) == 0) have_cs = 1;
	    else if (strncmp(buffer, "#=RF",    4) == 0) have_rf = 1;

	    if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) 
	      break;
	  } while (strchr(commentsyms, *nptr) != NULL);
      }


				/* check that number of sequences matches expected */
      if (blocknum == 0)
	num = currnum;
      else if (currnum != num)
	Die("Parse error in ReadSELEX()");
      blocknum++;

				/* get first line of next block 
				 * (non-comment, non-blank) */
      do
	{
	  if (fgets(buffer, LINEBUFLEN, fp) == NULL) { nptr = NULL; break; }
	  strcpy(bufcpy, buffer);
	}
      while ((nptr = strtok(bufcpy, WHITESPACE)) == NULL || 
	     (strchr(commentsyms, *nptr) != NULL));
    }

  
  /***************************************************
   * Get ready for second pass:
   *   figure out the length of the alignment
   *   malloc space
   *   rewind the file
   ***************************************************/

  alen = 0;
  for (currblock = 0; currblock < blocknum; currblock++)
    alen += blocks[currblock].rcol - blocks[currblock].lcol + 1;

  rewind(fp);

  /* allocations. we can't use AllocateAlignment because of
   * the way we already used ainfo->sqinfo.
   */
  aseqs     = (char **) MallocOrDie (num * sizeof(char *));
  if (have_cs) 
    ainfo->cs = (char *) MallocOrDie ((alen+1) * sizeof(char));
  if (have_rf) 
    ainfo->rf = (char *) MallocOrDie ((alen+1) * sizeof(char));

  
  
  for (i = 0; i < num; i++)
    {
      aseqs[i]     = (char *) MallocOrDie ((alen+1) * sizeof(char));
      if (ainfo->sqinfo[i].flags & SQINFO_SS)
	ainfo->sqinfo[i].ss = (char *) MallocOrDie ((alen+1) * sizeof(char));
      if (ainfo->sqinfo[i].flags & SQINFO_SA)
	ainfo->sqinfo[i].sa = (char *) MallocOrDie ((alen+1) * sizeof(char));
    }
  
  ainfo->alen = alen;
  ainfo->nseq = num; 
  ainfo->wgt  = (float *) MallocOrDie (sizeof(float) * num);
  FSet(ainfo->wgt, num, 1.0);

  /***************************************************
   * Second pass across file. Parse header; assemble sequences
   ***************************************************/
  /* We've now made a complete first pass over the file. We know how
   * many blocks it contains, we know the number of seqs in the first
   * block, and we know every block has the same number of blocks;
   * so we can be a bit more cavalier about error-checking as we
   * make the second pass.
   */

  /* Look for header
   */
  headnum = 0;
  for (;;)
    {
      if (fgets(buffer, LINEBUFLEN, fp) == NULL)
	Die("Parse error in ReadSELEX()");
      strcpy(bufcpy, buffer);
      if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) continue; /* skip blank lines */

      if (strcmp(nptr, "#=AU") == 0  && (sptr = strtok(NULL, "\n")) != NULL)
	ainfo->au = Strdup(sptr);
      else if (strcmp(nptr, "#=ID") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
	ainfo->name = Strdup(sptr);
      else if (strcmp(nptr, "#=AC") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
	ainfo->acc  = Strdup(sptr);
      else if (strcmp(nptr, "#=DE") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
	ainfo->desc = Strdup(sptr);
      else if (strcmp(nptr, "#=GA") == 0)
	{
	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 
	    Die("Parse error in #=GA line in ReadSELEX()");
	  ainfo->ga1 = atof(sptr);

	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 
	    Die("Parse error in #=GA line in ReadSELEX()");
	  ainfo->ga2 = atof(sptr);

	  ainfo->flags |= AINFO_GA;
	}
      else if (strcmp(nptr, "#=TC") == 0)
	{
	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 
	    Die("Parse error in #=TC line in ReadSELEX()");
	  ainfo->tc1 = atof(sptr);

	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 
	    Die("Parse error in #=TC line in ReadSELEX()");
	  ainfo->tc2 = atof(sptr);

	  ainfo->flags |= AINFO_TC;
	}
      else if (strcmp(nptr, "#=NC") == 0)
	{
	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 
	    Die("Parse error in #=NC line in ReadSELEX()");
	  ainfo->nc1 = atof(sptr);

	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 
	    Die("Parse error in #=NC line in ReadSELEX()");
	  ainfo->nc2 = atof(sptr);

	  ainfo->flags |= AINFO_NC;
	}
      else if (strcmp(nptr, "#=SQ") == 0)      /* per-sequence header info */
	{
				/* first field is the name */
	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
	    Die("Parse error in #=SQ line in ReadSELEX()");
	  if (strcmp(sptr, ainfo->sqinfo[headnum].name) != 0) warn_names = TRUE;

				/* second field is the weight */
	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
	    Die("Parse error in #=SQ line in ReadSELEX()");
	  if (!IsReal(sptr)) 
	    Die("Parse error in #=SQ line in ReadSELEX(): weight is not a number");
	  ainfo->wgt[headnum] = atof(sptr);

				/* third field is database source id */
	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
	    Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
	  SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_ID);

				/* fourth field is database accession number */
	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
	    Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
	  SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_ACC);

				/* fifth field is start..stop::olen */
	  if ((sptr = strtok(NULL, ".:")) == NULL)
	    Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
	  SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_START);

	  if ((sptr = strtok(NULL, ".:")) == NULL)
	    Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
	  SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_STOP);
	  
	  if ((sptr = strtok(NULL, ":\t ")) == NULL)
	    Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
	  SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_OLEN);

				/* rest of line is optional description */
	  if ((sptr = strtok(NULL, "\n")) != NULL)
	    SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_DESC);
	  
	  headnum++;
	}
      else if (strcmp(nptr, "#=CS") == 0) break;
      else if (strcmp(nptr, "#=RF") == 0) break;
      else if (strchr(commentsyms, *nptr) == NULL) break; /* non-comment, non-header */
    }
  

  currlen = 0;
  for (currblock = 0 ; currblock < blocknum; currblock++)
    {
				/* parse the block */
      seqidx = 0;
      while (nptr != NULL)
	{
				/* Consensus structure */
	  if (strcmp(nptr, "#=CS") == 0)
	    {
	      if (! copy_alignment_line(ainfo->cs, currlen, strlen(nptr)-1, 
					buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
		Die("Parse error in #=CS line in ReadSELEX()");
	    }

				/* Reference coordinates */
	  else if (strcmp(nptr, "#=RF") == 0)
	    {
	      if (! copy_alignment_line(ainfo->rf, currlen, strlen(nptr)-1, 
					buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
		Die("Parse error in #=RF line in ReadSELEX()");
	    }
				/* Individual secondary structure */
	  else if (strcmp(nptr, "#=SS") == 0)
	    {
	      if (! copy_alignment_line(ainfo->sqinfo[seqidx-1].ss, currlen, strlen(nptr)-1,
					buffer, blocks[currblock].lcol, 
					blocks[currblock].rcol, (char) '.'))
		Die("Parse error in #=SS line in ReadSELEX()");
	    }

				/* Side chain % surface accessibility code */
	  else if (strcmp(nptr, "#=SA") == 0)
	    {
	      if (! copy_alignment_line(ainfo->sqinfo[seqidx-1].sa, currlen, strlen(nptr)-1,
					buffer, blocks[currblock].lcol, 
					blocks[currblock].rcol, (char) '.'))
		Die("Parse error in #=SA line in ReadSELEX()");
	    }
				/* Aligned sequence; avoid unparsed machine comments */
	  else if (strncmp(nptr, "#=", 2) != 0)
	    {
	      if (! copy_alignment_line(aseqs[seqidx], currlen, strlen(nptr)-1, 
					buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
		Die("Parse error in alignment line in ReadSELEX()");
	      seqidx++;
	    }

				/* get next line */
	  for (;;)
	    {
	      nptr = NULL;
	      if (fgets(buffer, LINEBUFLEN, fp) == NULL) break;	/* EOF */
	      strcpy(bufcpy, buffer);
	      if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) break; /* blank */
	      if (strncmp(buffer, "#=", 2) == 0) break;      /* machine comment */
	      if (strchr(commentsyms, *nptr) == NULL) break; /* data */
	    }
	} /* end of a block */

      currlen += blocks[currblock].rcol - blocks[currblock].lcol + 1;

				/* get line 1 of next block */
      for (;;)
	{
	  if (fgets(buffer, LINEBUFLEN, fp) == NULL) break; /* no data */
	  strcpy(bufcpy, buffer);
	  if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) continue; /* blank */
	  if (strncmp(buffer, "#=", 2) == 0)       break; /* machine comment */
	  if (strchr(commentsyms, *nptr) == NULL) break; /* non-comment */
	}
    } /* end of the file */

  /* Lengths in sqinfo are for raw sequence (ungapped),
   * and SS, SA are 0..rlen-1 not 0..alen-1.
   * Only the seqs with structures come out of here with lengths set.
   */
  for (seqidx = 0; seqidx < num; seqidx++)
    {
      int apos, rpos;
				/* secondary structures */
      if (ainfo->sqinfo[seqidx].flags & SQINFO_SS)
	{
	  for (apos = rpos = 0; apos < alen; apos++)
	    if (! isgap(aseqs[seqidx][apos]))
	      {
		ainfo->sqinfo[seqidx].ss[rpos] = ainfo->sqinfo[seqidx].ss[apos];
		rpos++;
	      }
	  ainfo->sqinfo[seqidx].ss[rpos] = '\0';
	}
				/* Surface accessibility */
      if (ainfo->sqinfo[seqidx].flags & SQINFO_SA)
	{
	  for (apos = rpos = 0; apos < alen; apos++)
	    if (! isgap(aseqs[seqidx][apos]))
	      {
		ainfo->sqinfo[seqidx].sa[rpos] = ainfo->sqinfo[seqidx].sa[apos];
		rpos++;
	      }
	  ainfo->sqinfo[seqidx].sa[rpos] = '\0';
	}
    }

				/* NULL-terminate all the strings */
  if (ainfo->rf != NULL) ainfo->rf[alen] = '\0';
  if (ainfo->cs != NULL) ainfo->cs[alen] = '\0';
  for (seqidx = 0; seqidx < num; seqidx++)
    aseqs[seqidx][alen]            = '\0';
  
				/* find raw sequence lengths for sqinfo */
  for (seqidx = 0; seqidx < num; seqidx++)
    {
      count = 0;
      for (sptr = aseqs[seqidx]; *sptr != '\0'; sptr++)
	if (!isgap(*sptr)) count++;
      ainfo->sqinfo[seqidx].len    = count;
      ainfo->sqinfo[seqidx].flags |= SQINFO_LEN;
    }


  /***************************************************
   * Garbage collection and return
   ***************************************************/
  free(blocks);
  if (warn_names) 
    Warn("sequences may be in different orders in blocks of %s?", afp->fname);

  /* Convert back to MSA structure. (Wasteful kludge.)
   */
  msa = MSAFromAINFO(aseqs, ainfo);
  MSAVerifyParse(msa);
  FreeAlignment(aseqs, ainfo);
  return msa;
}
Ejemplo n.º 12
0
static void
readSquid(struct ReadSeqVars *V)
{
  char *sptr;
  int   dostruc = FALSE;

  while (strncmp(V->sbuffer, "NAM ", 4) != 0) getline2(V);

  if ((sptr = strtok(V->sbuffer+4, "\n\t ")) != NULL)
    SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);

  /*CONSTCOND*/
  while (1)
    {
      getline2(V);
      if (feof(V->f)) {squid_errno = SQERR_FORMAT; return; }

      if (strncmp(V->sbuffer, "SRC ", 4) == 0)
	{
	  if ((sptr = strtok(V->sbuffer+4, " \t\n")) != NULL)
	    SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID);
	  if ((sptr = strtok(NULL, " \t\n")) != NULL)
	    SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC);
	  if ((sptr = strtok(NULL, ".")) != NULL)
	    SetSeqinfoString(V->sqinfo, sptr, SQINFO_START);
	  if ((sptr = strtok(NULL, ".:")) != NULL)
	    SetSeqinfoString(V->sqinfo, sptr, SQINFO_STOP);
	  if ((sptr = strtok(NULL, ": \t\n")) != NULL)
	    SetSeqinfoString(V->sqinfo, sptr, SQINFO_OLEN);
	}
      else if (strncmp(V->sbuffer, "DES ", 4) == 0)
	{
	  if ((sptr = strtok(V->sbuffer+4, "\n")) != NULL)
	    SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC);
	}
      else if (strncmp(V->sbuffer,"SEQ", 3) == 0)
	break;
  }
  
  if (strstr(V->sbuffer, "+SS") != NULL) dostruc = TRUE;

  V->seqlen = 0;
  /*CONSTCOND*/
  while (1)
    {
				/* sequence line */
      getline2(V);
      if (feof(V->f) || strncmp(V->sbuffer, "++", 2) == 0) 
	break;
      addseq(V->sbuffer, V);
				/* structure line */
      if (dostruc)
	{
	  getline2(V);
	  if (feof(V->f)) { squid_errno = SQERR_FORMAT; return; }
	  addstruc(V->sbuffer, V);
	}
    }


  while (!feof(V->f) && strncmp(V->sbuffer, "NAM ", 4) != 0)
    getline2(V);
}