/* Function:  esl_recorder_Read()
 * Synopsis:  Fetch the next line of the stream tracked by an <ESL_RECORDER>.
 * Incept:    SRE, Fri Dec 25 16:31:00 2009 [Casa de Gatos]
 *
 * Purpose:   Advance the recorder <rc> by one line and hand back a
 *            pointer to that line in <*opt_line> (if <opt_line> is
 *            non-NULL). If the current position is already behind
 *            the number of lines read so far, the stored line is
 *            returned without touching the file; otherwise a new
 *            line is read from <rc->fp> into the recorder's wrapped
 *            line array, after first recording the file position
 *            reported by <ftello()> for that line.
 *
 *            The returned pointer refers to memory owned by <rc>.
 *            A caller that wants to keep the line must copy it
 *            right away, because later <esl_recorder_*> calls may
 *            reuse or reallocate the same buffer.
 *
 * Returns:   <eslOK> on success.
 *            <eslEOF> if no more lines exist in the stream.
 *            On any non-OK return, <*opt_line> is set to NULL.
 *
 * Throws:    <eslEMEM> on an allocation failure.
 */
int
esl_recorder_Read(ESL_RECORDER *rc, char **opt_line)
{
  int status;
  int slot = (rc->ncurr - rc->baseline) % rc->nalloc;   /* wrapped index of the line being requested */

  /* Lines numbered below <nread> are already stored; only go to the
   * file when the current position has caught up with what was read.
   */
  if (rc->ncurr >= rc->nread)
    {
      /* The slot about to be overwritten is the one holding the
       * marked line: enlarge the window first, then recompute the
       * slot because <nalloc> has changed.
       */
      if (rc->markline >= 0)
        {
          int mark_slot = (rc->markline - rc->baseline) % rc->nalloc;

          if (slot == mark_slot)
            {
              int xtra = ESL_MAX(3, (rc->nalloc / 3));

              status = esl_recorder_ResizeTo(rc, rc->nalloc + xtra);
              if (status) goto ERROR;
              slot = (rc->ncurr - rc->baseline) % rc->nalloc;
            }
        }

      /* Remember where this line starts, then read it. */
      rc->offset[slot] = ftello(rc->fp);
      status = esl_fgets(&(rc->line[slot]), &(rc->lalloc[slot]), rc->fp);
      if (status) goto ERROR;
      rc->nread++;
    }

  rc->ncurr++;
  if (opt_line != NULL) *opt_line = rc->line[slot];
  return eslOK;

 ERROR:
  if (opt_line != NULL) *opt_line = NULL;
  return status;
}
int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage); ESL_STOPWATCH *w = esl_stopwatch_Create(); ESL_RECORDER *rc = NULL; char *filename = esl_opt_GetArg(go, 1); int N = esl_opt_GetInteger(go, "-N"); FILE *fp = NULL; char *buf = NULL; int balloc = 0; int status; if ((fp = fopen(filename, "r")) == NULL) esl_fatal("no such file %s\n", filename); rc = esl_recorder_Create(fp, N); esl_stopwatch_Start(w); while ((status = esl_recorder_Read(rc, &buf)) == eslOK); esl_recorder_Destroy(rc); fclose(fp); esl_stopwatch_Stop(w); esl_stopwatch_Display(stdout, w, "recorder time: "); if ((fp = fopen(filename, "r")) == NULL) esl_fatal("no such file %s\n", filename); esl_stopwatch_Start(w); while ((status = esl_fgets(&buf, &balloc, fp)) == eslOK); free(buf); fclose(fp); esl_stopwatch_Stop(w); esl_stopwatch_Display(stdout, w, "esl_fgets() time: "); esl_stopwatch_Destroy(w); esl_getopts_Destroy(go); return 0; }
/* regurgitate_pfam_as_afa()
 *
 * Given an open Pfam formatted msafile, read the next alignment and
 * regurgitate it in aligned FASTA (AFA) format without storing
 * it in a esl_msa data structure.
 *
 * We need to do two passes through the file because in Pfam
 * sequence accessions (#=GS <seqname> AC) and sequence descriptions
 * (#=GS <seqname> DE) appear altogether before any aligned sequence
 * data, while in AFA they appear on the same line as the sequence
 * name (accession, then description).
 *
 * Example:
 * # STOCKHOLM 1.0
 * #=GS tRNA1 AC RF00005-1
 * #=GS tRNA2 AC RF00005-2
 * #=GS tRNA1 DE first tRNA
 * #=GS tRNA2 DE second tRNA
 *
 * tRNA1 GCGGAUUUAGCUCAGUUGGG.AGAGCGCCAGACUGAAGAUCUGGAGGUCCUGUGUUCGAUCCACAGAAUUCGCA
 * tRNA2 UCCGAUAUAGUGUAAC.GGCUAUCACAUCACGCUUUCACCGUGGAGA.CCGGGGUUCGACUCCCCGUAUCGGAG
 *
 * converts to AFA:
 * >tRNA1 RF00005-1 first tRNA
 * GCGGAUUUAGCUCAGUUGGG.AGAGCGCCAGACUGAAGAUCUGGAGGUCCUGUGUUCGAU
 * CCACAGAAUUCGCA
 * >tRNA2 RF00005-2 second tRNA
 * UCCGAUAUAGUGUAAC.GGCUAUCACAUCACGCUUUCACCGUGGAGA.CCGGGGUUCGAC
 * UCCCCGUAUCGGAG
 *
 * In the first pass, output the sequence names and accessions we find
 * as '#=GS <seqname> AC' lines in the Pfam alignment to an accession
 * tmpfile, and output sequence names and descriptions we find
 * as '#=GS <seqname> DE' lines in the Pfam alignment to a description
 * tmpfile.
 *
 * In the second pass, rewind all (up to 3) files: <ac_tmpfile>,
 * <de_tmpfile> and the Pfam alignment file and start reading them
 * again. As we're reading them, output the accessions, descriptions
 * and aligned sequence data in the proper order to an aligned FASTA
 * file.
 *
 * The alignment file is "rewound" by closing <afp> and reopening
 * <alifile>; the reopened handle is closed before returning. So once
 * the first pass has completed, the caller's <afp> has been closed by
 * this function. The one exception is when EOF is hit before any
 * Stockholm header is seen in the first pass: then we return
 * immediately and <afp> is left open.
 *
 * Each output sequence line holds at most 60 residues. The symbol
 * conversions requested by <rfrom>/<rto>, <gapsym>, <force_lower>,
 * <force_upper>, <force_rna>, <force_dna>, <iupac_to_n> and <x_is_bad>
 * are applied, in that order, to a private copy of each chunk. If
 * <rename> is non-NULL, sequences are named <rename>.<i> (i=1..N)
 * instead of by their own names.
 *
 * Set <ret_reached_eof> as TRUE if the alignment read and reformatted
 * appears to be the only one remaining in afp (this includes the case
 * where no alignment at all remains). Set <ret_reached_eof>
 * as FALSE if afp appears to include at least one more alignment.
 *
 * Returns void. Dies upon any input error.
 */
static void
regurgitate_pfam_as_afa(ESLX_MSAFILE *afp, FILE *ofp, char *alifile, char *gapsym, int force_lower, int force_upper,
                        int force_rna, int force_dna, int iupac_to_n, int x_is_bad, char *rename, char *rfrom, char *rto, int *ret_reached_eof)
{
  char      *p              = NULL;
  esl_pos_t  n              = 0;
  esl_pos_t  gslen, seqnamelen, taglen;
  char      *seqname        = NULL;    /* points into the line buffer; NOT NUL-terminated, use with seqnamelen */
  char      *first_seqname  = NULL;    /* NUL-terminated copy of the first sequence name seen in pass 2        */
  char      *tag            = NULL;
  char      *gs             = NULL;
  int        nseq_read      = 0;
  int        reached_eof    = FALSE;
  /* variables related to reading accessions */
  char       ac_tmpfile[16] = "esltmpXXXXXX";
  FILE      *ac_fp          = NULL;    /* file ptr for accession tmpfile          */
  char      *ac_buf         = NULL;    /* buffer for line input w/ esl_fgets()    */
  int        ac_buflen      = 0;       /* current allocated length for ac_buf     */
  char      *ac_s           = NULL;
  char      *ac_seqname     = NULL;    /* seqname of next pending accession, or NULL if none left */
  char      *ac             = NULL;
  int        have_ac        = FALSE;
  /* variables related to reading descriptions */
  char       de_tmpfile[16] = "esltmpXXXXXX";
  FILE      *de_fp          = NULL;    /* file ptr for description tmpfile        */
  char      *de_buf         = NULL;    /* buffer for line input w/ esl_fgets()    */
  int        de_buflen      = 0;       /* current allocated length for de_buf     */
  char      *de_s           = NULL;
  char      *de_seqname     = NULL;    /* seqname of next pending description, or NULL if none left */
  char      *de             = NULL;
  int        have_de        = FALSE;
  /* variables related to printing out sequences */
  char      *aseq           = NULL;
  esl_pos_t  aseqlen        = 0;
  int64_t    apos;
  char       aseqbuf[61];
  int        cpl            = 60;      /* number of residues per afa seq line     */
  int        acpl;                     /* actual number of characters per line    */
  int        status;

  afp->errmsg[0] = '\0';

  /**************************************************************************************************
   * First pass, go through each line of the Pfam file and output all GS DE and AC annotation to tmpfiles
   **************************************************************************************************/

  /* Check the magic Stockholm header line, allowing blank lines */
  do {
    status = eslx_msafile_GetLine(afp, &p, &n);
    if      (status == eslEOF) { *ret_reached_eof = TRUE; return; }  /* nothing left to read: always tell the caller so */
    else if (status != eslOK)  esl_fatal("small mem parse error. problem reading line %d of msafile", (int) afp->linenumber);
  } while (esl_memspn(afp->line, afp->n, " \t") == afp->n  ||          /* skip blank lines             */
           (esl_memstrpfx(afp->line, afp->n, "#")                      /* and skip comment lines       */
            && ! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM")));    /* but stop on Stockholm header */

  if (! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM 1.")) esl_fatal("small mem parse failed (line %d): missing \"# STOCKHOLM\" header", (int) afp->linenumber);

  while ((status = eslx_msafile_GetLine(afp, &p, &n)) == eslOK)
    {
      while (n && ( *p == ' ' || *p == '\t')) { p++; n--; } /* skip leading whitespace */

      if (esl_memstrpfx(p, n, "#=GS"))
        { /* only lines we need to check are AC and DE lines, we don't even check other lines for validity */
          if (esl_memtok(&p, &n, " \t", &gs,      &gslen)      != eslOK) esl_fatal("small mem parse failed (line %d) in a way that can't happen", (int) afp->linenumber);
          if (esl_memtok(&p, &n, " \t", &seqname, &seqnamelen) != eslOK) esl_fatal("small mem parse failed (line %d): #=GS line missing <seqname>, <tag>, annotation", (int) afp->linenumber);
          if (esl_memtok(&p, &n, " \t", &tag,     &taglen)     != eslOK) esl_fatal("small mem parse failed (line %d): #=GS line missing <tag>, annotation", (int) afp->linenumber);
          if (! esl_memstrcmp(gs, gslen, "#=GS"))                        esl_fatal("small mem parse failed (line %d): faux #=GS line?", (int) afp->linenumber);

          if (esl_memstrcmp(tag, taglen, "AC"))
            { /* tmpfile is opened lazily, on the first AC line */
              if (! ac_fp && esl_tmpfile(ac_tmpfile, &ac_fp) != eslOK) esl_fatal("small mem parse failed, unable to open accession tmpfile");
              fprintf(ac_fp, "%.*s %.*s\n", (int) seqnamelen, seqname, (int) n, p);
            }
          if (esl_memstrcmp(tag, taglen, "DE"))
            { /* tmpfile is opened lazily, on the first DE line */
              if (! de_fp && esl_tmpfile(de_tmpfile, &de_fp) != eslOK) esl_fatal("small mem parse failed, unable to open description tmpfile");
              fprintf(de_fp, "%.*s %.*s\n", (int) seqnamelen, seqname, (int) n, p);
            }
        }
      else if (esl_memstrpfx(p, n, "//")) break;
    }
  if      (status == eslEOF) esl_fatal("small mem parse failed (line %d): missing // terminator", (int) afp->linenumber);
  else if (status != eslOK)  esl_fatal("small mem parse failed (line %d) with code %d", (int) afp->linenumber, status);

  /* The regurgitate_*() functions are limited, and only deal with single-record Pfam files.
   * If there appears to be more data in the file, drop the reached_eof flag.
   */
  while ((status = eslx_msafile_GetLine(afp, &p, &n)) == eslOK)
    {
      while (n && ( *p == ' ' || *p == '\t')) { p++; n--; } /* skip leading whitespace */
      if      (esl_memstrpfx(p, n, "# STOCKHOLM 1.")) break;
      if      (n && ! esl_memstrpfx(p, n, "#"))       esl_fatal("small mem parse failed (line %d): unexpected data", (int) afp->linenumber);
    }
  if      (status == eslOK)  reached_eof = FALSE;
  else if (status == eslEOF) reached_eof = TRUE;
  else esl_fatal("--small parse error. problem reading line %d of msafile", (int) afp->linenumber);

  /*****************************************************************
   * Pass 1 complete; rewind (close/reopen) all files
   *****************************************************************/

  eslx_msafile_Close(afp);
  if ((status = eslx_msafile_Open(NULL, alifile, NULL, eslMSAFILE_PFAM, NULL, &afp)) != eslOK)
    esl_fatal("--small, second pass, unable to open file %s for reading", alifile);

  if (ac_fp)
    { /* rewind the tmpfile with the seq accessions and preload its first record */
      rewind(ac_fp);
      if ((status = esl_fgets(&(ac_buf), &(ac_buflen), ac_fp)) != eslOK)        esl_fatal("--small accession tmpfile parse failed");
      ac_s = ac_buf;
      if (esl_strtok_adv(&ac_s, " \t\n\r", &ac_seqname, NULL, NULL) != eslOK) esl_fatal("--small accession tmpfile parse failed");
      if (esl_strtok_adv(&ac_s, "\n\r",    &ac,         NULL, NULL) != eslOK) esl_fatal("--small accession tmpfile parse failed");
    }
  if (de_fp)
    { /* rewind the tmpfile with the seq descriptions and preload its first record */
      rewind(de_fp);
      if ((status = esl_fgets(&(de_buf), &(de_buflen), de_fp)) != eslOK)        esl_fatal("--small description tmpfile parse failed");
      de_s = de_buf;
      if (esl_strtok_adv(&de_s, " \t\n\r", &de_seqname, NULL, NULL) != eslOK) esl_fatal("--small description tmpfile parse failed");
      if (esl_strtok_adv(&de_s, "\n\r",    &de,         NULL, NULL) != eslOK) esl_fatal("--small description tmpfile parse failed");
    }

  /******************************************************************************************
   * Pass 2, step through files, outputting appropriately
   ******************************************************************************************/

  do {
    status = eslx_msafile_GetLine(afp, &p, &n);
    if      (status == eslEOF) goto DONE;   /* release everything and report reached_eof, rather than returning bare */
    else if (status != eslOK)  esl_fatal("small mem parse pass 2 error. problem reading line %d of msafile", (int) afp->linenumber);
  } while (esl_memspn(afp->line, afp->n, " \t") == afp->n  ||          /* skip blank lines             */
           (esl_memstrpfx(afp->line, afp->n, "#")                      /* and skip comment lines       */
            && ! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM")));    /* but stop on Stockholm header */

  if (! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM 1.")) esl_fatal("small mem parse pass 2 failed (line %d): missing \"# STOCKHOLM\" header", (int) afp->linenumber);

  while ((status = eslx_msafile_GetLine(afp, &p, &n)) == eslOK)
    {
      while (n && ( *p == ' ' || *p == '\t')) { p++; n--; } /* skip leading whitespace */

      if      (!n || *p == '#')           continue;     /* skip blank lines, comments */
      else if (esl_memstrpfx(p, n, "//")) break;        /* end of alignment: end of record */
      else
        { /* sequence line. parse line into temporary strings */
          if (esl_memtok(&p, &n, " \t", &seqname, &seqnamelen) != eslOK) esl_fatal("small mem parse pass 2 failed (line %d): no seq name", (int) afp->linenumber);
          if (esl_memtok(&p, &n, " \t", &aseq,    &aseqlen)    != eslOK) esl_fatal("small mem parse pass 2 failed (line %d): no aseq",     (int) afp->linenumber);

          /* make sure we haven't just read a second line of the first sequence in file (we must be in Pfam 1 line/seq file) */
          if (nseq_read == 0)
            {
              if ((status = esl_memstrdup(seqname, seqnamelen, &(first_seqname))) != eslOK) esl_fatal("small mem parse failed: unable to copy seqname");
            }
          else if (esl_memstrcmp(seqname, seqnamelen, first_seqname))
            { /* report the NUL-terminated copy: <seqname> itself is an unterminated slice of the line buffer */
              esl_fatal("--small parse pass 2 failed (line %d): two seqs named %s. Alignment appears to be in interleaved Stockholm (not Pfam) format.", (int) afp->linenumber, first_seqname);
            }
          nseq_read++;

          /* determine if we have an accession and/or description for this sequence */
          have_de = have_ac = FALSE;
          if (ac_seqname && (esl_memstrcmp(seqname, seqnamelen, ac_seqname))) have_ac = TRUE;
          if (de_seqname && (esl_memstrcmp(seqname, seqnamelen, de_seqname))) have_de = TRUE;

          if (rename)
            fprintf(ofp, ">%s.%d%s%s%s%s\n",
                    rename, nseq_read,
                    (have_ac ? " " : "") , (have_ac ? ac : ""),
                    (have_de ? " " : "") , (have_de ? de : ""));
          else
            fprintf(ofp, ">%.*s%s%s%s%s\n",
                    (int) seqnamelen, seqname,
                    (have_ac ? " " : "") , (have_ac ? ac : ""),
                    (have_de ? " " : "") , (have_de ? de : ""));

          /* load next ac, de */
          if (have_ac)
            {
              status = esl_fgets(&(ac_buf), &(ac_buflen), ac_fp);
              if      (status == eslEOF) ac_seqname = NULL;
              else if (status == eslOK)
                {
                  ac_s = ac_buf;
                  if (esl_strtok_adv(&ac_s, " \t\n\r", &ac_seqname, NULL, NULL) != eslOK) esl_fatal("--small accession tmpfile parse failed");
                  if (esl_strtok_adv(&ac_s, "\n\r",    &ac,         NULL, NULL) != eslOK) esl_fatal("--small accession tmpfile parse failed");
                }
            }
          if (have_de)
            {
              status = esl_fgets(&(de_buf), &(de_buflen), de_fp);
              if      (status == eslEOF) de_seqname = NULL;
              else if (status == eslOK)
                {
                  de_s = de_buf;
                  if (esl_strtok_adv(&de_s, " \t\n\r", &de_seqname, NULL, NULL) != eslOK) esl_fatal("--small description tmpfile parse failed");
                  if (esl_strtok_adv(&de_s, "\n\r",    &de,         NULL, NULL) != eslOK) esl_fatal("--small description tmpfile parse failed");
                }
            }

          /* now print sequence, after converting symbols as nec */
          /* remember, aseq itself is part of an ESL_BUFFER and you can't write to it, so symconverts have to be on the copy */
          for (apos = 0; apos < aseqlen; apos += cpl)
            {
              acpl = (aseqlen - apos > cpl ? cpl : aseqlen - apos);
              strncpy(aseqbuf, aseq + apos, acpl);   /* acpl <= cpl < sizeof(aseqbuf), and we terminate explicitly */
              aseqbuf[acpl] = '\0';

              if (rfrom)       symconvert(aseqbuf, rfrom, rto);
              if (gapsym)      symconvert(aseqbuf, "-_.", gapsym);
              if (force_lower) symconvert(aseqbuf,
                                          "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
                                          "abcdefghijklmnopqrstuvwxyz");
              if (force_upper) symconvert(aseqbuf,
                                          "abcdefghijklmnopqrstuvwxyz",
                                          "ABCDEFGHIJKLMNOPQRSTUVWXYZ");
              if (force_rna)   symconvert(aseqbuf, "Tt", "Uu");
              if (force_dna)   symconvert(aseqbuf, "Uu", "Tt");
              if (iupac_to_n)  symconvert(aseqbuf,
                                          "RYMKSWHBVDrymkswhbvd",
                                          "NNNNNNNNNNnnnnnnnnnn");
              if (x_is_bad)    symconvert(aseqbuf, "Xx", "Nn");

              fprintf(ofp, "%s\n", aseqbuf);
            }
        }
    }
  /* If we saw a normal // end, we would've successfully read a line,
   * so when we get here, status (from the line read) should be eslOK.
   */
  if (status != eslOK) esl_fatal("--small parse pass 2 failed (line %d): didn't find // at end of alignment", (int) afp->linenumber);
  if (ac_seqname)      esl_fatal("--small parse pass 2 failed, sequence %s with #=GS AC line does not exist in alignment or is in different order.", ac_seqname);
  if (de_seqname)      esl_fatal("--small parse pass 2 failed, sequence %s with #=GS DE line does not exist in alignment or is in different order.", de_seqname);

 DONE:
  if (ac_fp) fclose(ac_fp);
  if (de_fp) fclose(de_fp);
  eslx_msafile_Close(afp);

  free(first_seqname);
  free(ac_buf);
  free(de_buf);

  *ret_reached_eof = reached_eof;
  return;
}
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; ESL_RANDOMNESS *r = NULL; int nselect = 0; char *filename = NULL; FILE *fp = NULL; char **larr = NULL; char *buf = NULL; int buflen = 0; char *tmp = NULL; int i,j; int n; /* Parse command line */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in app configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") ) cmdline_help(argv[0], go); if (esl_opt_ArgNumber(go) != 2) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); nselect = atoi(esl_opt_GetArg(go, 1)); filename = esl_opt_GetArg(go, 2); r = esl_randomness_Create(esl_opt_GetInteger(go, "--seed")); if ((larr = malloc(sizeof(char *) * nselect)) == NULL) esl_fatal("allocation failed"); if (strcmp(filename, "-") == 0) fp = stdin; else { if ((fp = fopen(filename, "r")) == NULL) esl_fatal("Failed to open file %s\n", filename); } n = 0; while (esl_fgets(&buf, &buflen, fp) == eslOK) { n++; i = esl_rnd_Roll(r, n); if (i < nselect) { for (j = i; j < nselect && j < n; j++) { tmp = larr[j]; larr[j] = buf; buf = tmp; } free(buf); buf = NULL; buflen = 0; } } for (i = 0; i < nselect; i++) printf("%s", larr[i]); if (fp != stdin) fclose(fp); for (i = 0; i < nselect; i++) free(larr[i]); free(larr); free(buf); esl_randomness_Destroy(r); esl_getopts_Destroy(go); return 0; }