/* Function:  esl_msafile_psiblast_GuessAlphabet()
 * Synopsis:  Guess the alphabet of an open PSI-BLAST MSA file.
 *
 * Purpose:   Guess the alphabet of the sequences in open
 *            PSI-BLAST format MSA file <afp>.
 *
 *            Each non-blank line is split into a leading name token
 *            and the remainder; alphabetic characters in the
 *            remainder are tallied case-insensitively into 26
 *            per-letter counts, which are handed to
 *            <esl_abc_GuessAlphabet()>. A guess is attempted once
 *            more than 500, 5000, and 50000 residues have been seen,
 *            and finally at EOF.
 *
 *            On a normal return, <*ret_type> is set to <eslDNA>,
 *            <eslRNA>, or <eslAMINO>, and <afp> is reset to its
 *            original position.
 *
 * Args:      afp      - open PSI-BLAST format MSA file
 *            ret_type - RETURN: <eslDNA>, <eslRNA>, or <eslAMINO>
 *
 * Returns:   <eslOK> on success.
 *            <eslENOALPHABET> if alphabet type can't be determined.
 *            In either case, <afp> is rewound to the position it
 *            started at.
 *
 *            On any other failure, <*ret_type> is set to
 *            <eslUNKNOWN> and the failing status is returned.
 */
int
esl_msafile_psiblast_GuessAlphabet(ESLX_MSAFILE *afp, int *ret_type)
{
  int       alphatype     = eslUNKNOWN;
  esl_pos_t anchor        = -1;
  int       threshold[3]  = { 500, 5000, 50000 }; /* we check after 500, 5000, 50000 residues; else we go to EOF */
  int       nsteps        = 3;
  int       step          = 0;
  int       nres          = 0;
  int       x;
  int64_t   ct[26];
  char     *p, *tok;
  esl_pos_t n,  toklen, pos;
  int       status;

  for (x = 0; x < 26; x++) ct[x] = 0;

  anchor = esl_buffer_GetOffset(afp->bf);
  if ((status = esl_buffer_SetAnchor(afp->bf, anchor)) != eslOK)
    { /* [eslINVAL] can't happen here */
      anchor = -1;                /* no anchor was set, so the ERROR path must not rewind/raise one */
      status = eslEINCONCEIVABLE;
      goto ERROR;
    }

  while ( (status = esl_buffer_GetLine(afp->bf, &p, &n)) == eslOK)
    {
      if ((status = esl_memtok(&p, &n, " \t", &tok, &toklen)) != eslOK) continue; /* blank lines */
      /* p now points to the rest of the sequence line, after a name */

      /* count characters into ct[] array */
      for (pos = 0; pos < n; pos++)
        {
          /* <ctype.h> functions require a value representable as unsigned char */
          unsigned char c = (unsigned char) p[pos];

          if (! isalpha(c)) continue;
          x = toupper(c) - 'A';
          if (x < 0 || x >= 26) continue;  /* locale-specific letter outside A-Z: never index ct[] out of range */
          ct[x]++;
          nres++;
        }

      /* try to stop early, checking after 500, 5000, and 50000 residues: */
      if (step < nsteps && nres > threshold[step])
        {
          if ((status = esl_abc_GuessAlphabet(ct, &alphatype)) == eslOK) goto DONE; /* (eslENOALPHABET) */
          step++;
        }
    }
  if (status != eslEOF) goto ERROR; /* [eslEMEM,eslESYS,eslEINCONCEIVABLE] */
  status = esl_abc_GuessAlphabet(ct, &alphatype); /* (eslENOALPHABET) */

 DONE:
  esl_buffer_SetOffset(afp->bf, anchor);   /* Rewind to where we were. */
  esl_buffer_RaiseAnchor(afp->bf, anchor);
  *ret_type = alphatype;
  return status;

 ERROR:
  if (anchor != -1)
    {
      esl_buffer_SetOffset(afp->bf, anchor);
      esl_buffer_RaiseAnchor(afp->bf, anchor);
    }
  *ret_type = eslUNKNOWN;
  return status;
}
/* regurgitate_pfam_as_pfam()
 *
 * Given an open Pfam formatted msafile, read the next alignment and
 * regurgitate it, after modifying it as necessary (change dna to rna,
 * wussify SS, etc) in Pfam format.
 *
 * Each sequence line is copied into a scratch buffer, passed through
 * symconvert() once for every enabled option (<rfrom>/<rto>, <gapsym>,
 * <force_lower>, <force_upper>, <force_rna>, <force_dna>, <iupac_to_n>,
 * <x_is_bad>), and written to <ofp>.
 *
 * When any of <wussify>, <dewuss>, <fullwuss> is set, #=GC and #=GR
 * lines are parsed and length-checked; the annotation of "#=GC SS_cons"
 * and "#=GR <seqname> SS" lines is converted, and other #=GC/#=GR
 * annotation is re-emitted unchanged. All remaining '#' lines are
 * echoed verbatim.
 *
 * Returns <eslOK> on success.
 * Returns <eslEOF> if there are no more alignments in <afp>.
 * Returns <eslEFORMAT> if parse fails because of a file format
 * problem, in which case afp->errmsg is set to contain a formatted
 * message that indicates the cause of the problem.
 * May also return the failure status of esl_memstrdup() or
 * ESL_REALLOC(). Line-read failures, a missing Stockholm header, a
 * missing "//" terminator, and WUSS conversion failures are fatal
 * (esl_fatal()).
 */
static int
regurgitate_pfam_as_pfam(ESLX_MSAFILE *afp, FILE *ofp, char *gapsym, int force_lower, int force_upper, int force_rna, int force_dna, int iupac_to_n, int x_is_bad, int wussify, int dewuss, int fullwuss, char *rfrom, char *rto)
{
  char      *p;
  esl_pos_t  n;
  char      *first_seqname = NULL;
  char      *gx            = NULL;
  char      *seqname       = NULL;
  char      *tag           = NULL;
  char      *text          = NULL;
  esl_pos_t  gxlen, namelen, taglen, textlen;
  int        nseq_read     = 0;
  int        parse_gc_and_gr;
  int        flushpoint    = 10000;
  int        exp_alen      = -1;
  char      *buf           = NULL;
  esl_pos_t  pos, pos2;
  int        status;

  parse_gc_and_gr = (wussify || dewuss || fullwuss) ? TRUE : FALSE; /* should we parse out GR/GC lines and check if they're SS lines? */
  afp->errmsg[0] = '\0';

  /* Check the magic Stockholm header line.
   * We have to skip blank lines here, else we perceive
   * trailing blank lines in a file as a format error when
   * reading in multi-record mode.
   * Every line consumed here (including the header) is echoed to <ofp>.
   */
  do {
    status = eslx_msafile_GetLine(afp, &p, &n);
    if      (status == eslEOF) return eslEOF;
    else if (status != eslOK)  esl_fatal("small mem parse error. problem reading line %d of msafile", (int) afp->linenumber);
    fprintf(ofp, "%.*s\n", (int) afp->n, afp->line);
  } while (esl_memspn(afp->line, afp->n, " \t") == afp->n  ||   /* skip blank lines             */
	   (esl_memstrpfx(afp->line, afp->n, "#")               /* and skip comment lines       */
	    && ! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM"))); /* but stop on Stockholm header */

  if (! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM 1.")) esl_fatal("small mem parse failed (line %d): missing \"# STOCKHOLM\" header", (int) afp->linenumber);

  /* Read the alignment file one line at a time. */
  while ((status = eslx_msafile_GetLine(afp, &p, &n)) == eslOK)
    {
      if ((int) afp->linenumber % flushpoint == 0) fflush(ofp);
      while (n && ( *p == ' ' || *p == '\t')) { p++; n--; } /* skip leading whitespace */

      if (!n) fprintf(ofp, "\n");
      else if (esl_memstrpfx(p, n, "//"))
	{
	  fprintf(ofp, "//\n");
	  break; /* normal way out */
	}
      else if (*p == '#')
	{
	  if (parse_gc_and_gr && esl_memstrpfx(p, n, "#=GC"))
	    {
	      /* parse line into temporary strings */
	      if (esl_memtok(&p, &n, " \t", &gx,   &gxlen)   != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "small mem parse failed (line %d): bad #=GC line", (int) afp->linenumber);
	      if (esl_memtok(&p, &n, " \t", &tag,  &taglen)  != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "small mem parse failed (line %d): bad #=GC line", (int) afp->linenumber);
	      if (esl_memtok(&p, &n, " \t", &text, &textlen) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "small mem parse failed (line %d): bad #=GC line", (int) afp->linenumber);
	      pos = text - afp->line; /* pos: position of first aligned char on line; total width of annotation tag w/spaces */

	      /* verify alignment length */
	      if      (exp_alen == -1)      exp_alen = textlen;
	      else if (exp_alen != textlen) ESL_XFAIL(eslEFORMAT, afp->errmsg, "small mem parse failed (line %d): bad #=GC line, len %d, expected %d", (int) afp->linenumber, (int) textlen, (int) exp_alen);

	      /* we need to make a writable string copy of the annotation, to edit it */
	      ESL_REALLOC(buf, sizeof(char) * (textlen+1));
	      esl_memstrcpy(text, textlen, buf);

	      if (esl_memstrcmp(tag, taglen, "SS_cons"))
		{
		  if      (wussify) esl_kh2wuss(buf, buf);
		  else if (dewuss)  esl_wuss2kh(buf, buf);
		  else if (fullwuss)
		    {
		      status = esl_wuss_full(buf, buf);
		      if      (status == eslESYNTAX) esl_fatal("Bad SS_cons line: not in WUSS format, alifile line: %d", (int) afp->linenumber);
		      else if (status != eslOK)      esl_fatal("Conversion of SS_cons line failed, code %d, alifile line: %d", status, (int) afp->linenumber);
		    }
		}
	      fprintf(ofp, "#=GC %.*s%*s%s\n", (int) taglen, tag, (int) (pos-taglen-5), "", buf);
	    }
	  else if (parse_gc_and_gr && esl_memstrpfx(p, n, "#=GR"))  /* esl_memstrpfx() is TRUE on a match, as in the #=GC test above */
	    {
	      /* parse line into temporary strings */
	      if (esl_memtok(&p, &n, " \t", &gx,      &gxlen)   != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "--small parse failed (line %d): bad #=GR line", (int) afp->linenumber);
	      if (esl_memtok(&p, &n, " \t", &seqname, &namelen) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "--small parse failed (line %d): bad #=GR line", (int) afp->linenumber);
	      if (esl_memtok(&p, &n, " \t", &tag,     &taglen)  != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "--small parse failed (line %d): bad #=GR line", (int) afp->linenumber);
	      pos = tag - afp->line;   /* column where the tag starts */
	      if (esl_memtok(&p, &n, " \t", &text,    &textlen) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "--small parse failed (line %d): bad #=GR line", (int) afp->linenumber);
	      pos2 = text - afp->line; /* column where the annotation starts */

	      /* we need to make a writable string copy of the annotation, to edit it */
	      ESL_REALLOC(buf, sizeof(char) * (textlen+1));
	      esl_memstrcpy(text, textlen, buf);

	      /* verify alignment length */
	      if      (exp_alen == -1)      exp_alen = textlen;
	      else if (exp_alen != textlen) ESL_XFAIL(eslEFORMAT, afp->errmsg, "small mem parse failed (line %d): bad seq line, len %d, expected %d", (int) afp->linenumber, (int) textlen, (int) exp_alen);

	      if (esl_memstrcmp(tag, taglen, "SS"))  /* esl_memstrcmp() is TRUE when equal: only SS annotation is converted */
		{
		  if      (wussify) esl_kh2wuss(buf, buf);
		  else if (dewuss)  esl_wuss2kh(buf, buf);
		  else if (fullwuss)
		    {
		      status = esl_wuss_full(buf, buf);
		      if      (status == eslESYNTAX) esl_fatal("Bad SS line: not in WUSS format, alifile line: %d", (int) afp->linenumber);
		      else if (status != eslOK)      esl_fatal("Conversion of SS line failed, code %d, alifile line: %d", status, (int) afp->linenumber);
		    }
		}
	      fprintf(ofp, "#=GR %.*s%*s%.*s%*s%s\n", (int) namelen, seqname, (int) (pos-namelen-5), "", (int) taglen, tag, (int) (pos2-pos-taglen), "", buf);
	    }
	  else
	    { /* '#' prefixed line that is not #=GC or #=GR (or it is one of those and wussify,dewuss,fullwuss are all FALSE) */
	      fprintf(ofp, "%.*s\n", (int) afp->n, afp->line); /* print the line */
	    }
	} /* end of 'if (*p == '#')' */
      else
	{ /* sequence line */
	  if (esl_memtok(&p, &n, " \t", &seqname, &namelen) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "--small parse failed (line %d): bad sequence line", (int) afp->linenumber);
	  if (esl_memtok(&p, &n, " \t", &text,    &textlen) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "--small parse failed (line %d): bad sequence line", (int) afp->linenumber);
	  pos = text - afp->line;

	  /* verify alignment length */
	  if      (exp_alen == -1)      exp_alen = textlen;
	  else if (exp_alen != textlen) ESL_XFAIL(eslEFORMAT, afp->errmsg, "small mem parse failed (line %d): bad seq line, len %d, expected %d", (int) afp->linenumber, (int) textlen, (int) exp_alen);

	  /* make sure we haven't just read a second line of the first sequence in file (we must be in Pfam 1 line/seq file) */
	  if (nseq_read == 0)
	    {
	      if ((status = esl_memstrdup(seqname, namelen, &(first_seqname))) != eslOK) goto ERROR;
	    }
	  else if (esl_memstrcmp(seqname, namelen, first_seqname))
	    { /* <seqname> points into the line buffer and is not NUL-terminated; report the equal, NUL-terminated copy */
	      ESL_XFAIL(eslEFORMAT, afp->errmsg, "parse failed (line %d): two seqs named %s. Alignment appears to be in Stockholm format. Reformat to Pfam with esl-reformat.", (int) afp->linenumber, first_seqname);
	    }
	  nseq_read++;

	  /* we need to make a writable string copy of the sequence, to edit it */
	  ESL_REALLOC(buf, sizeof(char) * (textlen+1));
	  esl_memstrcpy(text, textlen, buf);

	  /* make adjustments as necessary */
	  if (rfrom)       symconvert(buf, rfrom, rto);
	  if (gapsym)      symconvert(buf, "-_.", gapsym);
	  if (force_lower) symconvert(buf, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz");
	  if (force_upper) symconvert(buf, "abcdefghijklmnopqrstuvwxyz", "ABCDEFGHIJKLMNOPQRSTUVWXYZ");
	  if (force_rna)   symconvert(buf, "Tt", "Uu");
	  if (force_dna)   symconvert(buf, "Uu", "Tt");
	  if (iupac_to_n)  symconvert(buf, "RYMKSWHBVDrymkswhbvd", "NNNNNNNNNNnnnnnnnnnn");
	  if (x_is_bad)    symconvert(buf, "Xx", "Nn");

	  /* print it out */
	  fprintf(ofp, "%.*s%*s%s\n", (int) namelen, seqname, (int) (pos-namelen), "", buf);
	}
    }

  /* If we saw a normal // end, we would've successfully read a line,
   * so when we get here, status (from the line read) should be eslOK.
   */
  if (status != eslOK) esl_fatal("--small parse failed (line %d): didn't find // at end of alignment", (int) afp->linenumber);

  free(first_seqname);
  free(buf);
  return eslOK;

 ERROR:
  /* release scratch allocations on every failure path too */
  free(first_seqname);
  free(buf);
  return status;
}
/* regurgitate_pfam_as_afa()
 *
 * Given an open Pfam formatted msafile, read the next alignment and
 * regurgitate it in aligned FASTA (AFA) format without storing
 * it in a esl_msa data structure.
 *
 * We need to do two passes through the file because in Pfam
 * sequence accessions (#=GS <seqname> AC) and sequence descriptions
 * (#=GS <seqname> DE) appear altogether before any aligned sequence
 * data, while in AFA they appear on the same line as the sequence
 * name (accession, then description).
 *
 * Example:
 * # STOCKHOLM 1.0
 * #=GS tRNA1 AC RF00005-1
 * #=GS tRNA2 AC RF00005-2
 * #=GS tRNA1 DE first tRNA
 * #=GS tRNA2 DE second tRNA
 *
 * tRNA1 GCGGAUUUAGCUCAGUUGGG.AGAGCGCCAGACUGAAGAUCUGGAGGUCCUGUGUUCGAUCCACAGAAUUCGCA
 * tRNA2 UCCGAUAUAGUGUAAC.GGCUAUCACAUCACGCUUUCACCGUGGAGA.CCGGGGUUCGACUCCCCGUAUCGGAG
 *
 * converts to AFA:
 * >tRNA1 RF00005-1 first tRNA
 * GCGGAUUUAGCUCAGUUGGG.AGAGCGCCAGACUGAAGAUCUGGAGGUCCUGUGUUCGAU
 * CCACAGAAUUCGCA
 * >tRNA2 RF00005-2 second tRNA
 * UCCGAUAUAGUGUAAC.GGCUAUCACAUCACGCUUUCACCGUGGAGA.CCGGGGUUCGAC
 * UCCCCGUAUCGGAG
 *
 * In the first pass, output the sequence names and accessions we find
 * as '#=GS <seqname> AC' lines in the Pfam alignment to an accession
 * tmpfile, and output sequence names and descriptions we find
 * as '#=GS <seqname> DE' lines in the Pfam alignment to a description
 * tmpfile.
 *
 * In the second pass, rewind all (up to 3) files: <ac_tmpfile>,
 * <de_tmpfile> and the Pfam alignment file and start reading them
 * again. As we're reading them, output the accessions, descriptions
 * and aligned sequence data in the proper order to an aligned FASTA
 * file. The alignment file is "rewound" by closing <afp> and reopening
 * <alifile>; the reopened handle is closed before returning, so the
 * caller's <afp> is no longer usable once pass 1 has completed.
 *
 * If <rename> is non-NULL, sequences are named "<rename>.<n>" where
 * <n> is the 1-based order of the sequence in the alignment.
 *
 * Set <ret_reached_eof> as TRUE if the alignment read and reformatted
 * appears to be the only one remaining in afp. Set <ret_reached_eof>
 * as FALSE if afp appears to include at least one more alignment.
 * If <afp> is already at EOF on entry, nothing is written, <afp> is
 * left open, and <ret_reached_eof> is set to TRUE.
 *
 * Returns void. Dies upon any input error.
 */
static void
regurgitate_pfam_as_afa(ESLX_MSAFILE *afp, FILE *ofp, char *alifile, char *gapsym, int force_lower, int force_upper, int force_rna, int force_dna, int iupac_to_n, int x_is_bad, char *rename, char *rfrom, char *rto, int *ret_reached_eof)
{
  char      *p             = NULL;
  esl_pos_t  n             = 0;
  esl_pos_t  gslen, seqnamelen, taglen;
  char      *seqname       = NULL;
  char      *first_seqname = NULL;
  char      *tag           = NULL;
  char      *gs            = NULL;
  int        nseq_read     = 0;
  int        reached_eof   = TRUE;
  /* variables related to reading accessions */
  char       ac_tmpfile[16] = "esltmpXXXXXX";
  FILE      *ac_fp         = NULL; /* file ptr for accession tmpfile */
  char      *ac_buf        = NULL; /* buffer for line input w/ esl_fgets() */
  int        ac_buflen     = 0;    /* current allocated length for buf */
  char      *ac_s          = NULL;
  char      *ac_seqname    = NULL;
  char      *ac            = NULL;
  int        have_ac       = FALSE;
  /* variables related to reading descriptions */
  char       de_tmpfile[16] = "esltmpXXXXXX";
  FILE      *de_fp         = NULL; /* file ptr for description tmpfile */
  char      *de_buf        = NULL; /* buffer for line input w/ esl_fgets() */
  int        de_buflen     = 0;    /* current allocated length for buf */
  char      *de_s          = NULL;
  char      *de_seqname    = NULL;
  char      *de            = NULL;
  int        have_de       = FALSE;
  /* variables related to printing out sequences */
  char      *aseq          = NULL;
  esl_pos_t  aseqlen       = 0;
  int64_t    apos;
  char       aseqbuf[61];
  int        cpl           = 60;   /* number of residues per afa seq line */
  int        acpl;                 /* actual number of character per line */
  int        status;

  afp->errmsg[0] = '\0';

  /**************************************************************************************************
   * First pass, go through each line of the Pfam file and output all GS DE and AC annotation to tmpfiles
   **************************************************************************************************/

  /* Check the magic Stockholm header line, allowing blank lines */
  do {
    status = eslx_msafile_GetLine(afp, &p, &n);
    if (status == eslEOF)
      { /* nothing left to read: report that to the caller instead of leaving the flag unset */
	*ret_reached_eof = TRUE;
	return;
      }
    else if (status != eslOK) esl_fatal("small mem parse error. problem reading line %d of msafile", (int) afp->linenumber);
  } while (esl_memspn(afp->line, afp->n, " \t") == afp->n  ||   /* skip blank lines             */
	   (esl_memstrpfx(afp->line, afp->n, "#")               /* and skip comment lines       */
	    && ! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM"))); /* but stop on Stockholm header */

  if (! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM 1.")) esl_fatal("small mem parse failed (line %d): missing \"# STOCKHOLM\" header", (int) afp->linenumber);

  while ((status = eslx_msafile_GetLine(afp, &p, &n)) == eslOK)
    {
      while (n && ( *p == ' ' || *p == '\t')) { p++; n--; } /* skip leading whitespace */

      if (esl_memstrpfx(p, n, "#=GS"))
	{
	  /* only lines we need to check are AC and DE lines, we don't even check other lines for validity */
	  if (esl_memtok(&p, &n, " \t", &gs,      &gslen)      != eslOK) esl_fatal("small mem parse failed (line %d) in a way that can't happen", (int) afp->linenumber);
	  if (esl_memtok(&p, &n, " \t", &seqname, &seqnamelen) != eslOK) esl_fatal("small mem parse failed (line %d): #=GS line missing <seqname>, <tag>, annotation", (int) afp->linenumber);
	  if (esl_memtok(&p, &n, " \t", &tag,     &taglen)     != eslOK) esl_fatal("small mem parse failed (line %d): #=GS line missing <tag>, annotation", (int) afp->linenumber);
	  if (! esl_memstrcmp(gs, gslen, "#=GS")) esl_fatal("small mem parse failed (line %d): faux #=GS line?", (int) afp->linenumber);

	  /* the rest of the line (p, n) is the annotation text; tmpfiles are created lazily on first use */
	  if (esl_memstrcmp(tag, taglen, "AC"))
	    {
	      if (! ac_fp && esl_tmpfile(ac_tmpfile, &ac_fp) != eslOK) esl_fatal("small mem parse failed, unable to open accession tmpfile");
	      fprintf(ac_fp, "%.*s %.*s\n", (int) seqnamelen, seqname, (int) n, p);
	    }
	  if (esl_memstrcmp(tag, taglen, "DE"))
	    {
	      if (! de_fp && esl_tmpfile(de_tmpfile, &de_fp) != eslOK) esl_fatal("small mem parse failed, unable to open description tmpfile");
	      fprintf(de_fp, "%.*s %.*s\n", (int) seqnamelen, seqname, (int) n, p);
	    }
	}
      else if (esl_memstrpfx(p, n, "//")) break;
    }
  if      (status == eslEOF) esl_fatal("small mem parse failed (line %d): missing // terminator", (int) afp->linenumber);
  else if (status != eslOK)  esl_fatal("small mem parse failed (line %d) with code %d", (int) afp->linenumber, status);

  /* The regurgitate_*() functions are limited, and only deal with single-record Pfam files.
   * If there appears to be more data in the file, drop the reached_eof flag.
   */
  while ((status = eslx_msafile_GetLine(afp, &p, &n)) == eslOK)
    {
      while (n && ( *p == ' ' || *p == '\t')) { p++; n--; } /* skip leading whitespace */
      if (esl_memstrpfx(p, n, "# STOCKHOLM 1.")) break;
      if (n && ! esl_memstrpfx(p, n, "#")) esl_fatal("small mem parse failed (line %d): unexpected data", (int) afp->linenumber);
    }
  if      (status == eslOK)  reached_eof = FALSE;
  else if (status == eslEOF) reached_eof = TRUE;
  else esl_fatal("--small parse error. problem reading line %d of msafile", (int) afp->linenumber);

  /*****************************************************************
   * Pass 1 complete; rewind (close/reopen) all files
   *****************************************************************/

  eslx_msafile_Close(afp);
  if ((status = eslx_msafile_Open(NULL, alifile, NULL, eslMSAFILE_PFAM, NULL, &afp)) != eslOK)
    esl_fatal("--small, second pass, unable to open file %s for reading", alifile);

  if (ac_fp)
    { /* rewind the tmpfile with the seq accessions and load its first entry */
      rewind(ac_fp);
      if ((status = esl_fgets(&(ac_buf), &(ac_buflen), ac_fp)) != eslOK) esl_fatal("--small accession tmpfile parse failed");
      ac_s = ac_buf;
      if (esl_strtok_adv(&ac_s, " \t\n\r", &ac_seqname, NULL, NULL) != eslOK) esl_fatal("--small accession tmpfile parse failed");
      if (esl_strtok_adv(&ac_s, "\n\r",    &ac,         NULL, NULL) != eslOK) esl_fatal("--small accession tmpfile parse failed");
    }
  if (de_fp)
    { /* rewind the tmpfile with the seq descriptions and load its first entry */
      rewind(de_fp);
      if ((status = esl_fgets(&(de_buf), &(de_buflen), de_fp)) != eslOK) esl_fatal("--small description tmpfile parse failed");
      de_s = de_buf;
      if (esl_strtok_adv(&de_s, " \t\n\r", &de_seqname, NULL, NULL) != eslOK) esl_fatal("--small description tmpfile parse failed");
      if (esl_strtok_adv(&de_s, "\n\r",    &de,         NULL, NULL) != eslOK) esl_fatal("--small description tmpfile parse failed");
    }

  /******************************************************************************************
   * Pass 2, step through files, outputting appropriately
   ******************************************************************************************/

  do {
    status = eslx_msafile_GetLine(afp, &p, &n);
    if      (status == eslEOF) goto CLEANUP;  /* still release everything and report reached_eof */
    else if (status != eslOK)  esl_fatal("small mem parse pass 2 error. problem reading line %d of msafile", (int) afp->linenumber);
  } while (esl_memspn(afp->line, afp->n, " \t") == afp->n  ||   /* skip blank lines             */
	   (esl_memstrpfx(afp->line, afp->n, "#")               /* and skip comment lines       */
	    && ! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM"))); /* but stop on Stockholm header */

  if (! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM 1.")) esl_fatal("small mem parse pass 2 failed (line %d): missing \"# STOCKHOLM\" header", (int) afp->linenumber);

  while ((status = eslx_msafile_GetLine(afp, &p, &n)) == eslOK)
    {
      while (n && ( *p == ' ' || *p == '\t')) { p++; n--; } /* skip leading whitespace */

      if      (!n || *p == '#')           continue; /* skip blank lines, comments */
      else if (esl_memstrpfx(p, n, "//")) break;    /* end of alignment: end of record */
      else
	{ /* sequence line. parse line into temporary strings */
	  if (esl_memtok(&p, &n, " \t", &seqname, &seqnamelen) != eslOK) esl_fatal("small mem parse pass 2 failed (line %d): no seq name", (int) afp->linenumber);
	  if (esl_memtok(&p, &n, " \t", &aseq,    &aseqlen)    != eslOK) esl_fatal("small mem parse pass 2 failed (line %d): no aseq", (int) afp->linenumber);

	  /* make sure we haven't just read a second line of the first sequence in file (we must be in Pfam 1 line/seq file) */
	  if (nseq_read == 0)
	    {
	      if ((status = esl_memstrdup(seqname, seqnamelen, &(first_seqname))) != eslOK) esl_fatal("small mem parse failed: unable to copy seqname");
	    }
	  else if (esl_memstrcmp(seqname, seqnamelen, first_seqname))
	    { /* <seqname> is not NUL-terminated; print the equal, NUL-terminated copy instead */
	      esl_fatal("--small parse pass 2 failed (line %d): two seqs named %s. Alignment appears to be in interleaved Stockholm (not Pfam) format.", (int) afp->linenumber, first_seqname);
	    }
	  nseq_read++;

	  /* determine if we have an accession and/or description for this sequence */
	  have_de = have_ac = FALSE;
	  if (ac_seqname && (esl_memstrcmp(seqname, seqnamelen, ac_seqname))) have_ac = TRUE;
	  if (de_seqname && (esl_memstrcmp(seqname, seqnamelen, de_seqname))) have_de = TRUE;

	  if (rename) fprintf(ofp, ">%s.%d%s%s%s%s\n", rename, nseq_read, (have_ac ? " " : "") , (have_ac ? ac : ""), (have_de ? " " : "") , (have_de ? de : ""));
	  else        fprintf(ofp, ">%.*s%s%s%s%s\n", (int) seqnamelen, seqname, (have_ac ? " " : "") , (have_ac ? ac : ""), (have_de ? " " : "") , (have_de ? de : ""));

	  /* load next ac, de */
	  if (have_ac)
	    {
	      status = esl_fgets(&(ac_buf), &(ac_buflen), ac_fp);
	      if      (status == eslEOF) ac_seqname = NULL;
	      else if (status == eslOK)
		{
		  ac_s = ac_buf;
		  if (esl_strtok_adv(&ac_s, " \t\n\r", &ac_seqname, NULL, NULL) != eslOK) esl_fatal("--small accession tmpfile parse failed");
		  if (esl_strtok_adv(&ac_s, "\n\r",    &ac,         NULL, NULL) != eslOK) esl_fatal("--small accession tmpfile parse failed");
		}
	      else esl_fatal("--small accession tmpfile parse failed"); /* don't carry on with stale ac/ac_seqname pointers */
	    }
	  if (have_de)
	    {
	      status = esl_fgets(&(de_buf), &(de_buflen), de_fp);
	      if      (status == eslEOF) de_seqname = NULL;
	      else if (status == eslOK)
		{
		  de_s = de_buf;
		  if (esl_strtok_adv(&de_s, " \t\n\r", &de_seqname, NULL, NULL) != eslOK) esl_fatal("--small description tmpfile parse failed");
		  if (esl_strtok_adv(&de_s, "\n\r",    &de,         NULL, NULL) != eslOK) esl_fatal("--small description tmpfile parse failed");
		}
	      else esl_fatal("--small description tmpfile parse failed"); /* don't carry on with stale de/de_seqname pointers */
	    }

	  /* now print sequence, after converting symbols as nec */
	  /* remember, aseq itself is part of an ESL_BUFFER and you can't write to it, so symconverts have to be on the copy */
	  for (apos = 0; apos < aseqlen; apos += cpl)
	    {
	      acpl = (aseqlen - apos > cpl ? cpl : aseqlen - apos);
	      memcpy(aseqbuf, aseq + apos, (size_t) acpl); /* acpl <= cpl == sizeof(aseqbuf)-1 */
	      aseqbuf[acpl] = '\0';

	      if (rfrom)       symconvert(aseqbuf, rfrom, rto);
	      if (gapsym)      symconvert(aseqbuf, "-_.", gapsym);
	      if (force_lower) symconvert(aseqbuf, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz");
	      if (force_upper) symconvert(aseqbuf, "abcdefghijklmnopqrstuvwxyz", "ABCDEFGHIJKLMNOPQRSTUVWXYZ");
	      if (force_rna)   symconvert(aseqbuf, "Tt", "Uu");
	      if (force_dna)   symconvert(aseqbuf, "Uu", "Tt");
	      if (iupac_to_n)  symconvert(aseqbuf, "RYMKSWHBVDrymkswhbvd", "NNNNNNNNNNnnnnnnnnnn");
	      if (x_is_bad)    symconvert(aseqbuf, "Xx", "Nn");

	      fprintf(ofp, "%s\n", aseqbuf);
	    }
	}
    }
  /* If we saw a normal // end, we would've successfully read a line,
   * so when we get here, status (from the line read) should be eslOK.
   */
  if (status != eslOK) esl_fatal("--small parse pass 2 failed (line %d): didn't find // at end of alignment", (int) afp->linenumber);
  if (ac_seqname) esl_fatal("--small parse pass 2 failed, sequence %s with #=GS AC line does not exist in alignment or is in different order.", ac_seqname);
  if (de_seqname) esl_fatal("--small parse pass 2 failed, sequence %s with #=GS DE line does not exist in alignment or is in different order.", de_seqname);

 CLEANUP:
  if (ac_fp) fclose(ac_fp);
  if (de_fp) fclose(de_fp);
  eslx_msafile_Close(afp);

  free(first_seqname);
  free(ac_buf);
  free(de_buf);

  *ret_reached_eof = reached_eof;
  return;
}
/* Function:  esl_msafile_a2m_Read()
 * Synopsis:  Read a UCSC A2M format alignment.
 *
 * Purpose:   Read an MSA from an open <ESL_MSAFILE> <afp>, parsing
 *            for UCSC A2M (SAM) format. Create a new MSA,
 *            and return a ptr to it in <*ret_msa>. Caller is responsible
 *            for freeing this <ESL_MSA>.
 *
 *            The <msa> has a reference line (<msa->rf[]>) that
 *            corresponds to the uppercase/lowercase columns in the
 *            alignment: consensus (uppercase) columns are marked 'X',
 *            and insert (lowercase) columns are marked '.' in the RF
 *            annotation line.
 *
 *            This input parser can deal both with "dotless" A2M, and
 *            full A2M format with dots.
 *
 * Args:      afp     - open <ESL_MSAFILE>
 *            ret_msa - RETURN: newly parsed <ESL_MSA>
 *
 * Returns:   <eslOK> on success. <*ret_msa> is set to the newly
 *            allocated MSA, and <afp> is at EOF.
 *
 *            <eslEOF> if no (more) alignment data are found in
 *            <afp>, and <afp> is returned at EOF. <*ret_msa> is
 *            set to <NULL>.
 *
 *            <eslEFORMAT> on a parse error. <*ret_msa> is set to
 *            <NULL>. <afp> contains information sufficient for
 *            constructing useful diagnostic output:
 *            | <afp->errmsg>       | user-directed error message     |
 *            | <afp->linenumber>   | line # where error was detected |
 *            | <afp->line>         | offending line (not NUL-term)   |
 *            | <afp->n>            | length of offending line        |
 *            | <afp->bf->filename> | name of the file                |
 *            and <afp> is poised at the start of the following line,
 *            so (in principle) the caller could try to resume
 *            parsing.
 *
 * Throws:    <eslEMEM> - an allocation failed.
 *            <eslESYS> - a system call such as fread() failed
 *            <eslEINCONCEIVABLE> - "impossible" corruption
 *            On these, <*ret_msa> is returned <NULL>, and the state of
 *            <afp> is undefined.
 */
int
esl_msafile_a2m_Read(ESL_MSAFILE *afp, ESL_MSA **ret_msa)
{
  ESL_MSA  *msa        = NULL;
  char    **csflag     = NULL;	/* csflag[i][pos] is TRUE if aseq[i][pos] was uppercase consensus   */
  int       csalloc    = 0;	/* number of row pointers allocated (and initialized) in csflag[]   */
  int      *nins       = NULL;	/* # of inserted residues before each consensus col [0..ncons-1]    */
  int      *this_nins  = NULL;	/* # of inserted residues before each consensus residue in this seq */
  int       nseq       = 0;
  int       ncons      = 0;
  int       idx;
  int64_t   thislen;
  int64_t   spos;
  int       this_ncons;
  int       cpos, bpos;
  char     *p, *tok;
  esl_pos_t n,  toklen;
  int       status;

  ESL_DASSERT1( (afp->format == eslMSAFILE_A2M) );

  afp->errmsg[0] = '\0';

#ifdef eslAUGMENT_ALPHABET
  if (afp->abc   &&  (msa = esl_msa_CreateDigital(afp->abc, 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
#endif
  if (! afp->abc &&  (msa = esl_msa_Create(                 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
  ESL_ALLOC(csflag, sizeof(char *) * msa->sqalloc);
  for (idx = 0; idx < msa->sqalloc; idx++) csflag[idx] = NULL;
  csalloc = msa->sqalloc;

  /* skip leading blank lines in file */
  while ( (status = esl_msafile_GetLine(afp, &p, &n)) == eslOK  && esl_memspn(afp->line, afp->n, " \t") == afp->n) ;
  if (status != eslOK) goto ERROR; /* includes normal EOF */

  /* tolerate sloppy space at start of name/desc line */
  while (n && isspace((unsigned char) *p)) { p++; n--; }
  if (n == 0 || *p != '>') ESL_XFAIL(eslEFORMAT, afp->errmsg, "expected A2M name/desc line starting with >");

  do { /* for each record starting in '>': */
    p++; n--; /* advance past > */

    if ( (status = esl_memtok(&p, &n, " \t", &tok, &toklen)) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "no name found for A2M record");
    if (nseq >= msa->sqalloc) {
      if ( (status = esl_msa_Expand(msa)) != eslOK) goto ERROR;
      ESL_REALLOC(csflag, sizeof(char *) * msa->sqalloc);
      for (idx = csalloc; idx < msa->sqalloc; idx++) csflag[idx] = NULL;
      csalloc = msa->sqalloc;
    }

    if (     (status = esl_msa_SetSeqName       (msa, nseq, tok, toklen)) != eslOK) goto ERROR;
    if (n && (status = esl_msa_SetSeqDescription(msa, nseq, p,   n))      != eslOK) goto ERROR;

    /* now for each sequence line... */
    thislen    = 0;	/* count of lowercase, uppercase, and '-': w/o dots, on first pass */
    this_ncons = 0;	/* count of uppercase + '-': number of consensus columns in alignment: must match for all seqs */
    if (nseq)
      { /* <this_nins> was allocated while reading the first record (nseq == 0), with at least ncons+1 elements */
	for (cpos = 0; cpos <= ncons; cpos++)
	  this_nins[cpos] = 0;
      }

    while ( (status = esl_msafile_GetLine(afp, &p, &n)) == eslOK)
      {
	while (n && isspace((unsigned char) *p)) { p++; n--; } /* tolerate and skip leading whitespace on line */
	if (n  == 0)   continue;	/* tolerate and skip blank lines */
	if (*p == '>') break;

	ESL_REALLOC(csflag[nseq], sizeof(char) * (thislen + n + 1)); /* might be an overalloc by a bit, depending on whitespace on line */
	if (nseq == 0) {
	  /* On continuation lines, this_nins[0..this_ncons] already hold counts
	   * (this_nins[this_ncons] may be mid-run), so only zero the entries beyond them.
	   */
	  int first_new = (this_nins == NULL ? 0 : this_ncons + 1);

	  ESL_REALLOC(this_nins, sizeof(int) * (this_ncons + n + 1));
	  for (cpos = first_new; cpos <= this_ncons+n; cpos++)
	    this_nins[cpos] = 0;
	}

	for (spos = thislen, bpos = 0; bpos < n; bpos++)
	  {
	    unsigned char c = (unsigned char) p[bpos];

	    if      (c == 'O')   continue;
	    else if (isupper(c)) { csflag[nseq][spos++] = TRUE;  this_ncons++;            }
	    else if (islower(c)) { csflag[nseq][spos++] = FALSE; this_nins[this_ncons]++; }
	    else if (c == '-')   { csflag[nseq][spos++] = TRUE;  this_ncons++;            }

	    /* After the first record, <this_nins> has only ncons+1 slots: stop before indexing past them (even when ncons == 0). */
	    if (nseq && this_ncons > ncons) ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected # of consensus residues, didn't match previous seq(s)");
	  }
	csflag[nseq][spos] = TRUE; /* need a sentinel, because of the way the padding functions work */

#ifdef eslAUGMENT_ALPHABET
	if (msa->abc)   { status = esl_abc_dsqcat(afp->inmap, &(msa->ax[nseq]),   &thislen, p, n); }
#endif
	if (! msa->abc) { status = esl_strmapcat (afp->inmap, &(msa->aseq[nseq]), &thislen, p, n); }
	if (status == eslEINVAL) ESL_XFAIL(eslEFORMAT, afp->errmsg, "one or more invalid sequence characters");
	else if (status != eslOK) goto ERROR;
	ESL_DASSERT1( (spos == thislen) );
      }
    if (status != eslOK && status != eslEOF) goto ERROR; /* exception thrown by esl_msafile_GetLine() */
    /* status == OK: then *p == '>'. status == eslEOF: we're eof. status == anything else: error */

    /* A record with a name line but no sequence lines leaves csflag[nseq] (and, for the
     * first record, this_nins) unallocated; reject it rather than dereference NULL below.
     */
    if (csflag[nseq] == NULL) ESL_XFAIL(eslEFORMAT, afp->errmsg, "no sequence data found for A2M record");

    /* Finished reading a sequence record. */
    if (nseq == 0)
      {
	ncons = this_ncons;
	ESL_ALLOC(nins, sizeof(int) * (ncons+1));
	for (cpos = 0; cpos <= ncons; cpos++)
	  nins[cpos] = this_nins[cpos];
      }
    else
      {
	if (this_ncons != ncons) ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected # of consensus residues, didn't match previous seq(s)");
	for (cpos = 0; cpos <= ncons; cpos++)
	  nins[cpos] = ESL_MAX(nins[cpos], this_nins[cpos]);
      }
    nseq++;
  } while (status == eslOK);

  /* Now we have nseq *unaligned* sequences in ax/aseq[0..nseq-1]; call the length slen, though we don't explicitly store it
   * csflag[idx][spos] tells us whether each unaligned residue is an insertion or consensus, for spos==0..slen-1.
   * nins[0..ncons] tells us the max number of inserted residues before each consensus column
   * This is sufficient information to reconstruct each aligned sequence.
   */
  msa->nseq = nseq;
#ifdef eslAUGMENT_ALPHABET
  if (msa->abc)  { if ((status = a2m_padding_digital(msa, csflag, nins, ncons)) != eslOK) goto ERROR; }
#endif
  if (!msa->abc) { if ((status = a2m_padding_text   (msa, csflag, nins, ncons)) != eslOK) goto ERROR; }

  if (( status = esl_msa_SetDefaultWeights(msa)) != eslOK) goto ERROR;

  *ret_msa = msa;
  free(nins);
  free(this_nins);
  for (idx = 0; idx < csalloc; idx++) free(csflag[idx]);
  free(csflag);
  return eslOK;

 ERROR:
  free(nins);
  free(this_nins);
  if (csflag)
    { /* msa->nseq is not set until parsing completes, so walk every allocated row pointer */
      for (idx = 0; idx < csalloc; idx++) free(csflag[idx]);
      free(csflag);
    }
  if (msa) esl_msa_Destroy(msa);
  *ret_msa = NULL;
  return status;
}