Ejemplo n.º 1
0
/* Function:  esl_msafile_psiblast_GuessAlphabet()
 * Synopsis:  Guess the alphabet of an open PSI-BLAST MSA file.
 *
 * Purpose:   Guess the alpbabet of the sequences in open
 *            PSI-BLAST format MSA file <afp>.
 *            
 *            On a normal return, <*ret_type> is set to <eslDNA>,
 *            <eslRNA>, or <eslAMINO>, and <afp> is reset to its
 *            original position.
 *
 * Args:      afp      - open PSI-BLAST format MSA file
 *            ret_type - RETURN: <eslDNA>, <eslRNA>, or <eslAMINO>       
 *
 * Returns:   <eslOK> on success.
 *            <eslENOALPHABET> if alphabet type can't be determined.
 *            In either case, <afp> is rewound to the position it
 *            started at.
 */
int
esl_msafile_psiblast_GuessAlphabet(ESLX_MSAFILE *afp, int *ret_type)
{
  int       alphatype     = eslUNKNOWN;
  esl_pos_t anchor        = -1;
  int       threshold[3]  = { 500, 5000, 50000 }; /* we check after 500, 5000, 50000 residues; else we go to EOF */
  int       nsteps        = 3;
  int       step          = 0;
  int       nres          = 0;
  int       x;
  int64_t   ct[26];
  char     *p, *tok;
  esl_pos_t n,  toklen, pos;
  int       status;

  for (x = 0; x < 26; x++) ct[x] = 0;

  anchor = esl_buffer_GetOffset(afp->bf);
  if ((status = esl_buffer_SetAnchor(afp->bf, anchor)) != eslOK) { status = eslEINCONCEIVABLE; goto ERROR; } /* [eslINVAL] can't happen here */

  while ( (status = esl_buffer_GetLine(afp->bf, &p, &n)) == eslOK)
    {
      if ((status = esl_memtok(&p, &n, " \t", &tok, &toklen)) != eslOK) continue; /* blank lines */
      /* p now points to the rest of the sequence line, after a name */
      
      /* count characters into ct[] array */
      for (pos = 0; pos < n; pos++)
	if (isalpha(p[pos])) {
	  x = toupper(p[pos]) - 'A';
	  ct[x]++;
	  nres++; 
	}

      /* try to stop early, checking after 500, 5000, and 50000 residues: */
      if (step < nsteps && nres > threshold[step]) {
	if ((status = esl_abc_GuessAlphabet(ct, &alphatype)) == eslOK) goto DONE; /* (eslENOALPHABET) */
	step++;
      }
    }
  if (status != eslEOF) goto ERROR; /* [eslEMEM,eslESYS,eslEINCONCEIVABLE] */
  status = esl_abc_GuessAlphabet(ct, &alphatype); /* (eslENOALPHABET) */

 DONE:
  esl_buffer_SetOffset(afp->bf, anchor);   /* Rewind to where we were. */
  esl_buffer_RaiseAnchor(afp->bf, anchor);
  *ret_type = alphatype;
  return status;

 ERROR:
  if (anchor != -1) {
    esl_buffer_SetOffset(afp->bf, anchor);
    esl_buffer_RaiseAnchor(afp->bf, anchor);
  }
  *ret_type = eslUNKNOWN;
  return status;
}
Ejemplo n.º 2
0
/* regurgitate_pfam_as_pfam()
 * 
 * Given an open Pfam formatted msafile, read the next alignment and
 * regurgitate it, after modifying it as necessary (change dna to rna,
 * wussify SS, etc) in Pfam format.
 * 
 * Returns <eslOK> on success. 
 * Returns <eslEOF> if there are no more alignments in <afp>.
 * Returns <eslEFORMAT> if parse fails because of a file format
 * problem, in which case afp->errmsg is set to contain a formatted
 * message that indicates the cause of the problem.
 */
static int
regurgitate_pfam_as_pfam(ESLX_MSAFILE *afp, FILE *ofp, char *gapsym, int force_lower, int force_upper, int force_rna, int force_dna, int iupac_to_n, int x_is_bad, int wussify, int dewuss, int fullwuss, char *rfrom, char *rto)
{
  char      *p;
  esl_pos_t  n;
  char      *first_seqname = NULL;
  char      *gx      = NULL;
  char      *seqname = NULL;
  char      *tag     = NULL;
  char      *text    = NULL;
  esl_pos_t  gxlen, namelen, taglen, textlen;
  int        nseq_read = 0;
  int        parse_gc_and_gr;
  int        flushpoint = 10000;
  int        exp_alen = -1;
  char      *buf      = NULL;
  esl_pos_t  pos, pos2;
  int        status;


  parse_gc_and_gr = (wussify || dewuss || fullwuss) ? TRUE : FALSE; /* should we parse out GR/GC lines and check if they're SS lines? */
  afp->errmsg[0] = '\0';
   
  /* Check the magic Stockholm header line.
   * We have to skip blank lines here, else we perceive
   * trailing blank lines in a file as a format error when
   * reading in multi-record mode.
   */
  /* Check the magic Stockholm header line, allowing blank lines */
  do { 
    status = eslx_msafile_GetLine(afp, &p, &n);
    if      (status == eslEOF) return eslEOF; 
    else if (status != eslOK)  esl_fatal("small mem parse error. problem reading line %d of msafile", (int) afp->linenumber);
    fprintf(ofp, "%.*s\n", (int) afp->n, afp->line);
  } while (esl_memspn(afp->line, afp->n, " \t") == afp->n  ||                 /* skip blank lines             */
	       (esl_memstrpfx(afp->line, afp->n, "#")                         /* and skip comment lines       */
	   && ! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM")));            /* but stop on Stockholm header */

  if (! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM 1.")) esl_fatal("small mem parse failed (line %d): missing \"# STOCKHOLM\" header", (int) afp->linenumber);

  /* Read the alignment file one line at a time.  */
  while ((status = eslx_msafile_GetLine(afp, &p, &n)) == eslOK) 
    {
      if ((int) afp->linenumber % flushpoint == 0) fflush(ofp);
      while (n && ( *p == ' ' || *p == '\t')) { p++; n--; } /* skip leading whitespace */
    
      if      (!n)                          fprintf(ofp, "\n");
      else if (esl_memstrpfx(p, n, "//")) { fprintf(ofp, "//\n"); break; } /* normal way out */
      else if (*p == '#') 
	{
	  if (parse_gc_and_gr && esl_memstrpfx(p, n, "#=GC")) 
	    {  	/* parse line into temporary strings */
	      if (esl_memtok(&p, &n, " \t",  &gx,   &gxlen)   != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "small mem parse failed (line %d): bad #=GC line", (int) afp->linenumber);
	      if (esl_memtok(&p, &n, " \t",  &tag,  &taglen)  != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "small mem parse failed (line %d): bad #=GC line", (int) afp->linenumber);
	      if (esl_memtok(&p, &n,  " \t", &text, &textlen) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "small mem parse failed (line %d): bad #=GC line", (int) afp->linenumber);
	      pos = text - afp->line; /* pos: position of first aligned char on line; total width of annotation tag w/spaces */
	
	      /* verify alignment length */
	      if      (exp_alen == -1)      exp_alen = textlen;
	      else if (exp_alen != textlen) ESL_XFAIL(eslEFORMAT, afp->errmsg, "small mem parse failed (line %d): bad #=GC line, len %d, expected %d", (int) afp->linenumber, (int) textlen, (int) exp_alen);
	
	      /* we need to make a writable string copy of the annotation, to edit it */
	      ESL_REALLOC(buf, sizeof(char) * (textlen+1));
	      esl_memstrcpy(text, textlen, buf);
	     
	      if (esl_memstrcmp(tag, taglen, "SS_cons")) 
		{
		  if      (wussify)  esl_kh2wuss(buf, buf);
		  else if (dewuss)   esl_wuss2kh(buf, buf);
		  else if (fullwuss) 
		    { 
		      status = esl_wuss_full(buf, buf);
		      if      (status == eslESYNTAX) esl_fatal("Bad SS_cons line: not in WUSS format, alifile line: %d", (int) afp->linenumber);
		      else if (status != eslOK)      esl_fatal("Conversion of SS_cons line failed, code %d, alifile line: %d", status, (int) afp->linenumber);
		    }
		}		  
	      fprintf(ofp, "#=GC %.*s%*s%s\n", (int) taglen, tag, (int) (pos-taglen-5), "", buf);
	    }
	  else if (parse_gc_and_gr && esl_memstrpfx(p, n, "#=GR") == 0) 
	    { 
	      /* parse line into temporary strings */
	      if (esl_memtok(&p, &n, " \t", &gx,      &gxlen)   != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "--small parse failed (line %d): bad #=GR line", (int) afp->linenumber);
	      if (esl_memtok(&p, &n, " \t", &seqname, &namelen) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "--small parse failed (line %d): bad #=GR line", (int) afp->linenumber);
	      if (esl_memtok(&p, &n, " \t", &tag,     &taglen)  != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "--small parse failed (line %d): bad #=GR line", (int) afp->linenumber);
	      pos = tag   - afp->line;
	      if (esl_memtok(&p, &n, " \t", &text,    &textlen) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "--small parse failed (line %d): bad #=GR line", (int) afp->linenumber);
	      pos2 = text - afp->line;

	      /* we need to make a writable string copy of the annotation, to edit it */
	      ESL_REALLOC(buf, sizeof(char) * (textlen+1));
	      esl_memstrcpy(text, textlen, buf);

	      /* verify alignment length */
	      if      (exp_alen == -1)      exp_alen = textlen;
	      else if (exp_alen != textlen) ESL_XFAIL(eslEFORMAT, afp->errmsg, "small mem parse failed (line %d): bad seq line, len %d, expected %d", (int) afp->linenumber, (int) textlen, (int) exp_alen);
	
	      if (esl_memstrcmp(tag, taglen, "SS") == 0) 
		{
		  if      (wussify)  esl_kh2wuss(buf, buf);
		  else if (dewuss)   esl_wuss2kh(buf, buf);
		  else if (fullwuss) { 
		    status = esl_wuss_full(buf, buf);
		    if      (status == eslESYNTAX) esl_fatal("Bad SS line: not in WUSS format, alifile line: %d", (int) afp->linenumber);
		    else if (status != eslOK)      esl_fatal("Conversion of SS line failed, code %d, alifile line: %d", status, (int) afp->linenumber);
		  }
		}		  

	      fprintf(ofp, "#=GR %.*s%*s%.*s%*s%s\n", (int) namelen, seqname, (int) (pos-namelen-5), "", (int) taglen, tag, (int) (pos2-pos-taglen), "", buf);
	    }
	  else { /* '#' prefixed line that is not #=GR (or it is #=GR and wussify,dewuss,fullwuss are all FALSE) */
	    fprintf(ofp, "%.*s\n", (int) afp->n, afp->line); /* print the line */
	  }
	} /* end of 'if (*s == '#')' */ 
      else 
	{ /* sequence line */
	  if (esl_memtok(&p, &n, " \t", &seqname, &namelen) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "--small parse failed (line %d): bad sequence line", (int) afp->linenumber);
	  if (esl_memtok(&p, &n, " \t", &text,    &textlen) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "--small parse failed (line %d): bad sequence line", (int) afp->linenumber);
	  pos = text - afp->line;

	  /* verify alignment length */
	  if     (exp_alen == -1)      exp_alen = textlen;
	  else if(exp_alen != textlen) ESL_XFAIL(eslEFORMAT, afp->errmsg, "small mem parse failed (line %d): bad seq line, len %d, expected %d", (int) afp->linenumber, (int) textlen, (int) exp_alen);
      
	  /* make sure we haven't just read a second line of the first sequence in file (we must be in Pfam 1 line/seq file) */
	  if (nseq_read == 0) { if ((status = esl_memstrdup(seqname, namelen, &(first_seqname))) != eslOK) goto ERROR; }
	  else if (esl_memstrcmp(seqname, namelen, first_seqname)) { ESL_XFAIL(eslEFORMAT, afp->errmsg, "parse failed (line %d): two seqs named %s. Alignment appears to be in Stockholm format. Reformat to Pfam with esl-reformat.", (int) afp->linenumber, seqname); }
	  nseq_read++;
      
	  /* we need to make a writable string copy of the annotation, to edit it */
	  ESL_REALLOC(buf, sizeof(char) * (textlen+1));
	  esl_memstrcpy(text, textlen, buf);

	  /* make adjustments as necessary */
	  if (rfrom)       symconvert(buf, rfrom, rto);
	  if (gapsym)      symconvert(buf, "-_.", gapsym);
	  if (force_lower) symconvert(buf,
				      "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
				      "abcdefghijklmnopqrstuvwxyz");
	  if (force_upper) symconvert(buf,
				      "abcdefghijklmnopqrstuvwxyz",
				      "ABCDEFGHIJKLMNOPQRSTUVWXYZ");
	  if (force_rna)   symconvert(buf, "Tt", "Uu");
	  if (force_dna)   symconvert(buf, "Uu", "Tt");
	  if (iupac_to_n)  symconvert(buf, 
				      "RYMKSWHBVDrymkswhbvd",
				      "NNNNNNNNNNnnnnnnnnnn");
	  if (x_is_bad)    symconvert(buf,   "Xx", "Nn");
	  /* print it out */
	  fprintf(ofp, "%.*s%*s%s\n", (int) namelen, seqname, (int) (pos-namelen), "", buf);
	}
    }

  /* If we saw a normal // end, we would've successfully read a line,
   * so when we get here, status (from the line read) should be eslOK.
   */ 
  if (status != eslOK) esl_fatal("--small parse failed (line %d): didn't find // at end of alignment", (int) afp->linenumber);
  if (first_seqname) free(first_seqname);
  if (buf)           free(buf);
  return eslOK;

 ERROR:
  return status;
}
Ejemplo n.º 3
0
/* regurgitate_pfam_as_afa()
 * 
 * Given an open Pfam formatted msafile, read the next alignment and 
 * regurgitate it in aligned FASTA (AFA) format without storing
 * it in a esl_msa data structure.
 * 
 * We need to do two passes through the file because in Pfam
 * sequence accessions (#=GS <seqname> AC) and sequence descriptions
 * (#=GS <seqname> DE) appear altogether before any aligned sequence
 * data, while in AFA they appear on the same line as the sequence
 * name (accession, then description).
 *
 * Example: 
 * # STOCKHOLM 1.0
 * #=GS tRNA1 AC RF00005-1
 * #=GS tRNA2 AC RF00005-2
 * #=GS tRNA1 DE first tRNA
 * #=GS tRNA2 DE second tRNA
 * 
 * tRNA1 GCGGAUUUAGCUCAGUUGGG.AGAGCGCCAGACUGAAGAUCUGGAGGUCCUGUGUUCGAUCCACAGAAUUCGCA
 * tRNA2 UCCGAUAUAGUGUAAC.GGCUAUCACAUCACGCUUUCACCGUGGAGA.CCGGGGUUCGACUCCCCGUAUCGGAG
 * 
 * converts to AFA:
 * >tRNA1 RF00005-1 first tRNA
 * GCGGAUUUAGCUCAGUUGGG.AGAGCGCCAGACUGAAGAUCUGGAGGUCCUGUGUUCGAU
 * CCACAGAAUUCGCA
 * >tRNA2 RF00005-2 second tRNA
 * UCCGAUAUAGUGUAAC.GGCUAUCACAUCACGCUUUCACCGUGGAGA.CCGGGGUUCGAC
 * UCCCCGUAUCGGAG
 * 
 * In the first pass, output the sequence names and accessions we find
 * as '#=GS <seqname> AC' lines in the Pfam alignment to an accession
 * tmpfile, and output sequence names and descriptions we find as 
 * as '#=GS <seqname> DE' lines in the Pfam alignment to a description
 * tmpfile.
 *
 * In the second pass, rewind all (up to 3) files: <ac_tmpfile>,
 * <de_tmpfile> and the Pfam alignment file and start reading them
 * again.  As we're reading them, output the accessions, descriptions
 * and aligned sequence data in the proper order to an aligned FASTA
 * file.
 * 
 * Set <ret_reached_eof> as TRUE if the alignment read and reformatted
 * appears to be the only one remaining in afp.  Set <ret_reached_eof>
 * as FALSE if afp appears to include at least one more alignment.
 * 
 * Returns void. Dies upon any input error.
 */
static void
regurgitate_pfam_as_afa(ESLX_MSAFILE *afp, FILE *ofp, char *alifile, char *gapsym, int force_lower, int force_upper, int force_rna, int force_dna, int iupac_to_n, int x_is_bad, char *rename, char *rfrom, char *rto, int *ret_reached_eof)
{
  char      *p = NULL;
  esl_pos_t  n = 0;
  esl_pos_t  gslen, seqnamelen, taglen;
  char      *seqname = NULL;
  char      *first_seqname = NULL;
  char      *tag = NULL;
  char      *gs = NULL;
  int        nseq_read = 0;
  int        reached_eof;
  /* variables related to reading accessions */
  char       ac_tmpfile[16] = "esltmpXXXXXX";
  FILE      *ac_fp = NULL; /* file ptr for accession tmpfile */
  char      *ac_buf = NULL;	/* buffer for line input w/ sre_fgets()      */
  int        ac_buflen = 0;	/* current allocated length for buf          */
  char      *ac_s = NULL;	        
  char      *ac_seqname = NULL;
  char      *ac = NULL;
  int        have_ac = FALSE;
  /* variables related to reading descriptions */
  char       de_tmpfile[16] = "esltmpXXXXXX";
  FILE      *de_fp = NULL; /* file ptr for description tmpfile */
  char      *de_buf = NULL;	/* buffer for line input w/ sre_fgets()      */
  int        de_buflen = 0;	/* current allocated length for buf          */
  char      *de_s = NULL;	        
  char      *de_seqname = NULL;
  char      *de = NULL;
  int        have_de = FALSE;
  /* variables related to printing out sequences */
  char      *aseq = NULL;
  esl_pos_t  aseqlen = 0;
  int64_t    apos;
  char       aseqbuf[61];
  int        cpl = 60;	     /* number of residues per afa seq line */
  int        acpl;       /* actual number of character per line */
  int        status;
  
  afp->errmsg[0] = '\0';
   
  /**************************************************************************************************
   * First pass, go through each line of the Pfam file and output all GS DE and AC annotation to tmpfiles
   **************************************************************************************************/

  /* Check the magic Stockholm header line, allowing blank lines */
  do { 
    status = eslx_msafile_GetLine(afp, &p, &n);
    if      (status == eslEOF) return; 
    else if (status != eslOK)  esl_fatal("small mem parse error. problem reading line %d of msafile", (int) afp->linenumber);
  } while (esl_memspn(afp->line, afp->n, " \t") == afp->n  ||                 /* skip blank lines             */
	       (esl_memstrpfx(afp->line, afp->n, "#")                         /* and skip comment lines       */
	   && ! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM")));            /* but stop on Stockholm header */

  if (! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM 1.")) esl_fatal("small mem parse failed (line %d): missing \"# STOCKHOLM\" header", (int) afp->linenumber);

  while ((status = eslx_msafile_GetLine(afp, &p, &n)) == eslOK) 
    {
      while (n && ( *p == ' ' || *p == '\t')) { p++; n--; } /* skip leading whitespace */

      if (esl_memstrpfx(p, n, "#=GS"))
	{ /* only lines we need to check are AC and DE lines, we don't even check other lines for validity */
	  if (esl_memtok(&p, &n, " \t", &gs,      &gslen)      != eslOK) esl_fatal("small mem parse failed (line %d) in a way that can't happen",                      (int) afp->linenumber);
	  if (esl_memtok(&p, &n, " \t", &seqname, &seqnamelen) != eslOK) esl_fatal("small mem parse failed (line %d): #=GS line missing <seqname>, <tag>, annotation", (int) afp->linenumber);
	  if (esl_memtok(&p, &n, " \t", &tag,     &taglen)     != eslOK) esl_fatal("small mem parse failed (line %d): #=GS line missing <tag>, annotation",            (int) afp->linenumber);
	  if (! esl_memstrcmp(gs, gslen, "#=GS"))                        esl_fatal("small mem parse failed (line %d): faux #=GS line?",                                (int) afp->linenumber);

	  if (esl_memstrcmp(tag, taglen, "AC"))
	    { 
	      if (! ac_fp && esl_tmpfile(ac_tmpfile, &ac_fp) != eslOK) esl_fatal("small mem parse failed, unable to open accession tmpfile");
	      fprintf(ac_fp, "%.*s %.*s\n", (int) seqnamelen, seqname, (int) n, p);
	    }
	  if (esl_memstrcmp(tag, taglen, "DE"))
	    { 
	      if (! de_fp && esl_tmpfile(de_tmpfile, &de_fp) != eslOK) esl_fatal("small mem parse failed, unable to open description tmpfile");
	      fprintf(de_fp, "%.*s %.*s\n", (int) seqnamelen, seqname, (int) n, p);
	    }
	}
      else if (esl_memstrpfx(p, n, "//")) break;
    }
  if      (status == eslEOF) esl_fatal("small mem parse failed (line %d): missing // terminator", (int) afp->linenumber);
  else if (status != eslOK)  esl_fatal("small mem parse failed (line %d) with code %d", (int) afp->linenumber, status);
  
  /* The regurgitate_*() functions are limited, and only deal with single-record Pfam files. 
   * If there appears to be more data in the file, drop the reached_eof flag.
   */
  while ((status = eslx_msafile_GetLine(afp, &p, &n)) == eslOK) 
    {
      while (n && ( *p == ' ' || *p == '\t')) { p++; n--; } /* skip leading whitespace */
      if    (esl_memstrpfx(p, n, "# STOCKHOLM 1.")) break;
      if    (n && ! esl_memstrpfx(p, n, "#"))       esl_fatal("small mem parse failed (line %d): unexpected data", (int) afp->linenumber);
    }
  if      (status == eslOK)  reached_eof = FALSE;
  else if (status == eslEOF) reached_eof = TRUE;
  else esl_fatal("--small parse error. problem reading line %d of msafile", (int) afp->linenumber);

  /*****************************************************************
   * Pass 1 complete; rewind (close/reopen) all files
   *****************************************************************/

  eslx_msafile_Close(afp);
  if ((status = eslx_msafile_Open(NULL, alifile, NULL, eslMSAFILE_PFAM, NULL, &afp)) != eslOK)
    esl_fatal("--small, second pass, unable to open file %s for reading", alifile);
  
  if (ac_fp) { /* open the tmpfile with the seq accessions */
    rewind(ac_fp);
    if((status = esl_fgets(&(ac_buf), &(ac_buflen), ac_fp)) != eslOK) esl_fatal("--small accession tmpfile parse failed");
    ac_s = ac_buf;
    if (esl_strtok_adv(&ac_s, " \t\n\r", &ac_seqname, NULL, NULL) != eslOK) esl_fatal("--small accession tmpfile parse failed");
    if (esl_strtok_adv(&ac_s, "\n\r",    &ac,         NULL, NULL) != eslOK) esl_fatal("--small accession tmpfile parse failed");
  }
  if (de_fp) { /* open the tmpfile with the seq descriptions */
    rewind(de_fp);
    if((status = esl_fgets(&(de_buf), &(de_buflen), de_fp)) != eslOK) esl_fatal("--small description tmpfile parse failed");
    de_s = de_buf;
    if (esl_strtok_adv(&de_s, " \t\n\r", &de_seqname, NULL, NULL) != eslOK) esl_fatal("--small description tmpfile parse failed");
    if (esl_strtok_adv(&de_s, "\n\r",    &de,         NULL, NULL) != eslOK) esl_fatal("--small description tmpfile parse failed");
  }

  /******************************************************************************************
   * Pass 2, step through files, outputting appropriately
   ******************************************************************************************/

  do { 
    status = eslx_msafile_GetLine(afp, &p, &n);
    if      (status == eslEOF) return; 
    else if (status != eslOK)  esl_fatal("small mem parse pass 2 error. problem reading line %d of msafile", (int) afp->linenumber);
  } while (esl_memspn(afp->line, afp->n, " \t") == afp->n  ||                 /* skip blank lines             */
	       (esl_memstrpfx(afp->line, afp->n, "#")                         /* and skip comment lines       */
	   && ! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM")));            /* but stop on Stockholm header */

  if (! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM 1.")) esl_fatal("small mem parse pass 2 failed (line %d): missing \"# STOCKHOLM\" header", (int) afp->linenumber);

  while ((status = eslx_msafile_GetLine(afp, &p, &n)) == eslOK) 
    {
      while (n && ( *p == ' ' || *p == '\t')) { p++; n--; } /* skip leading whitespace */

      if      (!n || *p == '#')           continue;	    /* skip blank lines, comments */
      else if (esl_memstrpfx(p, n, "//")) break;	    /* end of alignment: end of record */
      else 
	{ /* sequence line. parse line into temporary strings */
	  if (esl_memtok(&p, &n, " \t", &seqname, &seqnamelen) != eslOK) esl_fatal("small mem parse pass 2 failed (line %d): no seq name", (int) afp->linenumber);
	  if (esl_memtok(&p, &n, " \t", &aseq,    &aseqlen)    != eslOK) esl_fatal("small mem parse pass 2 failed (line %d): no aseq",     (int) afp->linenumber);

	  /* make sure we haven't just read a second line of the first sequence in file (we must be in Pfam 1 line/seq file) */
	  if (nseq_read == 0) { if ((status = esl_memstrdup(seqname, seqnamelen, &(first_seqname))) != eslOK) esl_fatal("small mem parse failed: unable to copy seqname"); }
	  else if (esl_memstrcmp(seqname, seqnamelen, first_seqname)) esl_fatal("--small parse pass 2 failed (line %d): two seqs named %s. Alignment appears to be in interleaved Stockholm (not Pfam) format.", (int) afp->linenumber, seqname); 
	  nseq_read++;

	  /* determine if we have an accession and/or description for this sequence */
	  have_de = have_ac = FALSE;
	  if (ac_seqname && (esl_memstrcmp(seqname, seqnamelen, ac_seqname))) have_ac = TRUE;
	  if (de_seqname && (esl_memstrcmp(seqname, seqnamelen, de_seqname))) have_de = TRUE;

	  if (rename) fprintf(ofp, ">%s.%d%s%s%s%s\n",          rename, nseq_read, (have_ac ? " " : "") , (have_ac ? ac : ""), (have_de ? " " : "") , (have_de ? de : "")); 
	  else        fprintf(ofp, ">%.*s%s%s%s%s\n", (int) seqnamelen, seqname,   (have_ac ? " " : "") , (have_ac ? ac : ""), (have_de ? " " : "") , (have_de ? de : "")); 

	  /* load next ac, de */
	  if (have_ac) {
	    status = esl_fgets(&(ac_buf), &(ac_buflen), ac_fp);
	    if      (status == eslEOF) ac_seqname = NULL;
	    else if (status == eslOK) { 
	      ac_s = ac_buf;
	      if (esl_strtok_adv(&ac_s, " \t\n\r", &ac_seqname, NULL, NULL) != eslOK) esl_fatal("--small accession tmpfile parse failed");
	      if (esl_strtok_adv(&ac_s, "\n\r",    &ac,         NULL, NULL) != eslOK) esl_fatal("--small accession tmpfile parse failed");
	    }
	  }
	  if (have_de) {
	    status = esl_fgets(&(de_buf), &(de_buflen), de_fp);
	    if(status == eslEOF) de_seqname = NULL;
	    else if (status == eslOK) { 
	      de_s = de_buf;
	      if (esl_strtok_adv(&de_s, " \t\n\r", &de_seqname, NULL, NULL) != eslOK) esl_fatal("--small description tmpfile parse failed");
	      if (esl_strtok_adv(&de_s, "\n\r",    &de,         NULL, NULL) != eslOK) esl_fatal("--small description tmpfile parse failed");
	    }
	  }

	  /* now print sequence, after converting symbols as nec */
	  /* remember, aseq itself is part of an ESL_BUFFER and you
	     can't write to it, so symconverts have to be on the
	     copy */
	  for (apos = 0; apos < aseqlen; apos += cpl)
	    {
	      acpl = (aseqlen - apos > cpl ? cpl : aseqlen - apos);
	      strncpy(aseqbuf, aseq + apos, acpl);
	      aseqbuf[acpl] = '\0';

	      if (rfrom)       symconvert(aseqbuf, rfrom, rto);
	      if (gapsym)      symconvert(aseqbuf, "-_.", gapsym);
	      if (force_lower) symconvert(aseqbuf,
					  "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
					  "abcdefghijklmnopqrstuvwxyz");
	      if (force_upper) symconvert(aseqbuf,
					  "abcdefghijklmnopqrstuvwxyz",
					  "ABCDEFGHIJKLMNOPQRSTUVWXYZ");
	      if (force_rna)   symconvert(aseqbuf, "Tt", "Uu");
	      if (force_dna)   symconvert(aseqbuf, "Uu", "Tt");
	      if (iupac_to_n)  symconvert(aseqbuf, 
					  "RYMKSWHBVDrymkswhbvd",
					  "NNNNNNNNNNnnnnnnnnnn");
	      if (x_is_bad)    symconvert(aseqbuf,   "Xx", "Nn");

	      fprintf(ofp, "%s\n", aseqbuf);	      
	    }
	}
    }
  /* If we saw a normal // end, we would've successfully read a line,
   * so when we get here, status (from the line read) should be eslOK.
   */ 
  if (status != eslOK) esl_fatal("--small parse pass 2 failed (line %d): didn't find // at end of alignment", (int) afp->linenumber);
  if (ac_seqname)      esl_fatal("--small parse pass 2 failed, sequence %s with #=GS AC line does not exist in alignment or is in different order.", ac_seqname);
  if (de_seqname)      esl_fatal("--small parse pass 2 failed, sequence %s with #=GS DE line does not exist in alignment or is in different order.", de_seqname);

  if (ac_fp) fclose(ac_fp);
  if (de_fp) fclose(de_fp);
  eslx_msafile_Close(afp);

  if (first_seqname) free(first_seqname);
  if (ac_buf)        free(ac_buf);
  if (de_buf)        free(de_buf);

  *ret_reached_eof = reached_eof;
  return;
}
Ejemplo n.º 4
0
/* Function:  esl_msafile_a2m_Read()
 * Synopsis:  Read a UCSC A2M format alignment.
 *
 * Purpose:   Read an MSA from an open <ESL_MSAFILE> <afp>, parsing
 *            for UCSC A2M (SAM) format. Create a new MSA,
 *            and return a ptr to it in <*ret_msa>. Caller is responsible
 *            for freeing this <ESL_MSA>.
 *            
 *            The <msa> has a reference line (<msa->rf[]>) that
 *            corresponds to the uppercase/lowercase columns in the
 *            alignment: consensus (uppercase) columns are marked 'X',
 *            and insert (lowercase) columns are marked '.' in the RF
 *            annotation line.
 *
 *            This input parser can deal both with "dotless" A2M, and
 *            full A2M format with dots.
 *            
 * Args:      afp     - open <ESL_MSAFILE>
 *            ret_msa - RETURN: newly parsed <ESL_MSA>
 *
 * Returns:   <eslOK> on success. <*ret_msa> is set to the newly
 *            allocated MSA, and <afp> is at EOF.
 *
 *            <eslEOF> if no (more) alignment data are found in
 *            <afp>, and <afp> is returned at EOF. 
 *
 *            <eslEFORMAT> on a parse error. <*ret_msa> is set to
 *            <NULL>. <afp> contains information sufficient for
 *            constructing useful diagnostic output: 
 *            | <afp->errmsg>       | user-directed error message     |
 *            | <afp->linenumber>   | line # where error was detected |
 *            | <afp->line>         | offending line (not NUL-term)   |
 *            | <afp->n>            | length of offending line        |
 *            | <afp->bf->filename> | name of the file                |
 *            and <afp> is poised at the start of the following line,
 *            so (in principle) the caller could try to resume
 *            parsing.
 *            
 * Throws:    <eslEMEM> - an allocation failed.
 *            <eslESYS> - a system call such as fread() failed
 *            <eslEINCONCEIVABLE> - "impossible" corruption 
 *            On these, <*ret_msa> is returned <NULL>, and the state of
 *            <afp> is undefined.
 */
int
esl_msafile_a2m_Read(ESL_MSAFILE *afp, ESL_MSA **ret_msa)
{
  ESL_MSA  *msa        = NULL;
  char    **csflag     = NULL;	/* csflag[i][pos] is TRUE if aseq[i][pos] was uppercase consensus   */
  int      *nins       = NULL;	/* # of inserted residues before each consensus col [0..ncons-1]    */
  int      *this_nins  = NULL;	/* # of inserted residues before each consensus residue in this seq */
  int       nseq       = 0;
  int       ncons      = 0;
  int       idx;
  int64_t   thislen;
  int64_t   spos;
  int       this_ncons;
  int       cpos, bpos;
  char     *p, *tok;
  esl_pos_t n,  toklen;
  int       status;

  ESL_DASSERT1( (afp->format == eslMSAFILE_A2M) );

  afp->errmsg[0] = '\0';	

#ifdef eslAUGMENT_ALPHABET
  if (afp->abc   &&  (msa = esl_msa_CreateDigital(afp->abc, 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
#endif
  if (! afp->abc &&  (msa = esl_msa_Create(                 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
  ESL_ALLOC(csflag, sizeof(char *) * msa->sqalloc);
  for (idx = 0; idx < msa->sqalloc; idx++) csflag[idx] = NULL; 

  /* skip leading blank lines in file */
  while ( (status = esl_msafile_GetLine(afp, &p, &n)) == eslOK  && esl_memspn(afp->line, afp->n, " \t") == afp->n) ;
  if      (status != eslOK) goto ERROR; /* includes normal EOF */

  /* tolerate sloppy space at start of name/desc line */
  while (n && isspace(*p)) { p++; n--; }    
  if (*p != '>') ESL_XFAIL(eslEFORMAT, afp->errmsg, "expected A2M name/desc line starting with >");    

  do {	/* for each record starting in '>': */
    p++; n--; 			/* advance past > */
    
    if ( (status = esl_memtok(&p, &n, " \t", &tok, &toklen))   != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "no name found for A2M record");
    if (nseq >= msa->sqalloc) {
      int old_sqalloc = msa->sqalloc;
      if ( (status = esl_msa_Expand(msa)) != eslOK) goto ERROR;
      ESL_REALLOC(csflag, sizeof(char *) * msa->sqalloc);
      for (idx = old_sqalloc; idx < msa->sqalloc; idx++) csflag[idx] = NULL;
    }

    if (     (status = esl_msa_SetSeqName       (msa, nseq, tok, toklen)) != eslOK) goto ERROR;
    if (n && (status = esl_msa_SetSeqDescription(msa, nseq, p,   n))      != eslOK) goto ERROR;

    /* now for each sequence line... */
    thislen = 0;		/* count of lowercase, uppercase, and '-': w/o dots, on first pass */
    this_ncons = 0;		/* count of uppercase + '-': number of consensus columns in alignment: must match for all seqs */
    if (nseq) {
      for (cpos = 0; cpos <= ncons; cpos++) // A little tricksy. <this_nins> is allocated on first seq, when nseq=0. 
	this_nins[cpos] = 0;                // cppcheck gets confused and erroneously calls "possible null pointer deference"; ignore it.
    }

    while ( (status = esl_msafile_GetLine(afp, &p, &n)) == eslOK)
      {				
	while (n && isspace(*p)) { p++; n--; } /* tolerate and skip leading whitespace on line */
	if (n  == 0)   continue;	       /* tolerate and skip blank lines */
	if (*p == '>') break;

	ESL_REALLOC(csflag[nseq], sizeof(char) * (thislen + n + 1)); /* might be an overalloc by a bit, depending on whitespace on line */
	if (nseq == 0) {
	  ESL_REALLOC(this_nins, sizeof(int) * (this_ncons + n + 1));
	  for (cpos = this_ncons; cpos <= this_ncons+n; cpos++)
	    this_nins[cpos] = 0;
	}

	for (spos = thislen, bpos = 0; bpos < n; bpos++)
	  {
	    if      (p[bpos] == 'O')   continue;
	    else if (isupper(p[bpos])) { csflag[nseq][spos++] = TRUE;  this_ncons++;            }
	    else if (islower(p[bpos])) { csflag[nseq][spos++] = FALSE; this_nins[this_ncons]++; }
	    else if (p[bpos] == '-')   { csflag[nseq][spos++] = TRUE;  this_ncons++;            }
	    if (ncons && this_ncons > ncons) ESL_XFAIL(eslEFORMAT, afp->errmsg,  "unexpected # of consensus residues, didn't match previous seq(s)");
	  }
	csflag[nseq][spos] = TRUE; /* need a sentinel, because of the way the padding functions work */

#ifdef eslAUGMENT_ALPHABET
	if (msa->abc)   { status = esl_abc_dsqcat(afp->inmap, &(msa->ax[nseq]),   &thislen, p, n); } 
#endif
	if (! msa->abc) { status = esl_strmapcat (afp->inmap, &(msa->aseq[nseq]), &thislen, p, n); }
	if (status == eslEINVAL)   ESL_XFAIL(eslEFORMAT, afp->errmsg, "one or more invalid sequence characters");
	else if (status != eslOK)  goto ERROR;
	ESL_DASSERT1( (spos == thislen) );
      }	
    if (status != eslOK && status != eslEOF) goto ERROR; /* exception thrown by esl_msafile_GetLine() */
    /* status == OK: then *p == '>'. status == eslEOF: we're eof.  status == anything else: error */
    /* Finished reading a sequence record. */
    
    if (nseq == 0) 
      {
	ncons = this_ncons;
	ESL_ALLOC(nins, sizeof(int) * (ncons+1));
	for (cpos = 0; cpos <= ncons; cpos++)
	  nins[cpos] = this_nins[cpos];
      } 
    else 
      {
	if (this_ncons != ncons) ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected # of consensus residues, didn't match previous seq(s)");
	for (cpos = 0; cpos <= ncons; cpos++) 
	  nins[cpos]      = ESL_MAX(nins[cpos], this_nins[cpos]);
      }
    nseq++;
  } while (status == eslOK);
  
  /* Now we have nseq *unaligned* sequences in ax/aseq[0..nseq-1]; call the length slen, though we don't explicitly store it
   * csflag[idx][spos] tells us whether each unaligned residue is an insertion or consensus, for spos==0..slen-1.
   * nins[0..ncons] tells us the max number of inserted residues before each consensus column
   * This is sufficient information to reconstruct each aligned sequence.
   */
  msa->nseq = nseq;
#ifdef eslAUGMENT_ALPHABET
  if (msa->abc)  { if ((status = a2m_padding_digital(msa, csflag, nins, ncons)) != eslOK) goto ERROR; }
#endif
  if (!msa->abc) { if ((status = a2m_padding_text   (msa, csflag, nins, ncons)) != eslOK) goto ERROR; }

  if (( status = esl_msa_SetDefaultWeights(msa)) != eslOK) goto ERROR;

  *ret_msa  = msa;
  free(nins);
  free(this_nins);
  for (idx = 0; idx < msa->nseq; idx++) free(csflag[idx]);
  free(csflag);
  return eslOK;
  
 ERROR:
  if (nins)      free(nins);
  if (this_nins) free(this_nins);
  if (csflag) {
    for (idx = 0; idx < msa->nseq; idx++) 
      if (csflag[idx]) free(csflag[idx]);
    free(csflag);
  }
  if (msa) esl_msa_Destroy(msa);
  return status;
}