/**
 * Reads past the next ambiguous or unambiguous stretch of sequence
 * from the given FASTA file and returns its length.  Apart from
 * optionally echoing unambiguous characters to bpout, it does not do
 * anything with the sequence characters themselves; this is purely
 * for measuring lengths.
 *
 * State is carried from one call to the next in the function-local
 * static 'lastc' (the last significant character seen: '>', a gap
 * character, or -1 at end of input), so successive calls are coupled.
 * NOTE(review): presumably this means only one input may be scanned
 * at a time; confirm against callers.
 *
 * @param in     buffered input positioned at the next unread character
 * @param rparms read-in options; the fields consulted here are
 *               nsToAs, color and bisulfite
 * @param first  true if this is the first call for 'in'; leading
 *               whitespace is skipped and a '>' is expected.  The
 *               parameter is then reused as a local flag recording
 *               whether this stretch directly follows a name line.
 * @param bpout  if non-NULL, every counted unambiguous character is
 *               written here (as a color when rparms.color is set)
 * @return       RefRecord(off, len, first), where 'off' is the number
 *               of gap characters preceding the stretch and 'len' is
 *               the number of unambiguous characters in it
 */
RefRecord fastaRefReadSize(FileBuf& in,
                           const RefReadInParams& rparms,
                           bool first,
                           BitpairOutFileBuf* bpout)
{
	int c;
	static int lastc = '>'; // last character seen; persists across calls

	// RefRecord params
	size_t len = 0; // 'len' counts toward total length
	// 'off' counts number of ambiguous characters before first
	// unambiguous character
	size_t off = 0;

	// Pick off the first caret ('>') and any preceding whitespace
	if(first) {
		assert(!in.eof());
		lastc = '>';
		c = in.getPastWhitespace();
		if(in.eof()) {
			// Got eof right away; emit warning
			cerr << "Warning: Empty input file" << endl;
			lastc = -1;
			return RefRecord(0, 0, true);
		}
		assert(c == '>');
	}

	// From here on 'first' means "this stretch follows a name line";
	// it is cleared below if we are resuming after a gap instead.
	first = true;
	// Skip to the end of the id line; if the next line is either
	// another id line or a comment line, keep skipping
	if(lastc == '>') {
		// Skip to the end of the name line
		do {
			if((c = in.getPastNewline()) == -1) {
				// No more input
				cerr << "Warning: Encountered empty reference sequence" << endl;
				lastc = -1;
				return RefRecord(0, 0, true);
			}
			if(c == '>') {
				cerr << "Warning: Encountered empty reference sequence" << endl;
			}
			// continue until a non-name, non-comment line
		} while (c == '>');
	} else {
		first = false; // not the first in a sequence
		off = 1; // The gap has already been consumed, so count it
		if((c = in.get()) == -1) {
			// Don't emit a warning, since this might legitimately be
			// a gap on the end of the final sequence in the file
			lastc = -1;
			return RefRecord(off, len, first);
		}
	}

	// Now skip to the first DNA character, counting gap characters
	// as we go
	int lc = -1; // last-DNA char variable for color conversion
	while(true) {
		int cat = dna4Cat[c];
		// NOTE(review): 'cat' is not refreshed after this substitution,
		// so the character is still handled by the 'cat == 2' branch
		// below and 'c' is overwritten by in.get() before it is looked
		// at again.  The read-in loop further down re-reads dna4Cat
		// after substituting.  Confirm whether the difference is
		// intended before changing either loop.
		if(rparms.nsToAs && cat == 2) c = 'A';
		if(cat == 1) {
			// This is a DNA character
			if(rparms.color) {
				if(lc != -1) {
					// Got two consecutive unambiguous DNAs
					break; // to read-in loop
				}
				// Keep going; we need two consecutive unambiguous DNAs
				lc = charToDna5[(int)c];
				// The 'if(off > 0)' takes care of the case where
				// the reference is entirely unambiguous and we don't
				// want to incorrectly increment off.
				if(off > 0) off++;
			} else {
				break; // to read-in loop
			}
		} else if(cat == 2) {
			// A lone DNA character was seen just before this gap while
			// 'off' was still 0; count it as part of the gap too
			if(lc != -1 && off == 0) off++;
			lc = -1;
			off++; // skip over gap character and increment
		} else if(c == '>') {
			// Hit the next name line before finding any sequence
			if(off > 0 && lastc == '>') {
				cerr << "Warning: Encountered reference sequence with only gaps" << endl;
			} else if(lastc == '>') {
				cerr << "Warning: Encountered empty reference sequence" << endl;
			}
			lastc = '>';
			return RefRecord(off, 0, first);
		}
		c = in.get();
		if(c == -1) {
			// End-of-file
			if(off > 0 && lastc == '>') {
				cerr << "Warning: Encountered reference sequence with only gaps" << endl;
			} else if(lastc == '>') {
				cerr << "Warning: Encountered empty reference sequence" << endl;
			}
			lastc = -1;
			return RefRecord(off, 0, first);
		}
	}
	assert(!rparms.color || (lc != -1));
	assert_eq(1, dna4Cat[c]); // c must be unambiguous base
	if(off > 0 && rparms.color && first) {
		// Handle the case where the first record has ambiguous
		// characters but we're in color space; one of those counts is
		// spurious
		off--;
	}

	// in now points just past the first character of a sequence
	// line, and c holds the first character
	while(c != -1 && c != '>') {
		if(rparms.nsToAs && dna4Cat[c] == 2) c = 'A';
		uint8_t cat = dna4Cat[c];
		int cc = toupper(c);
		// Bisulfite mode: 'C' or 'c' is replaced by 'T' after the
		// category has already been determined
		if(rparms.bisulfite && cc == 'C') c = cc = 'T';
		if(cat == 1) {
			// It's a DNA character
			assert(cc == 'A' || cc == 'C' || cc == 'G' || cc == 'T');
			// Consume it
			len++;
			// Output it
			if(bpout != NULL) {
				if(rparms.color) {
					// output color, looked up from the current and
					// the previous DNA character
					bpout->write(dinuc2color[charToDna5[(int)c]][lc]);
				} else if(!rparms.color) {
					// output nucleotide (this condition is always true
					// when reached)
					bpout->write(charToDna5[c]);
				}
			}
			lc = charToDna5[(int)c];
		} else if(cat == 2) {
			// It's an N or a gap; it has been consumed, so remember it
			// in lastc so the next call starts with off = 1
			lastc = c;
			assert(cc != 'A' && cc != 'C' && cc != 'G' && cc != 'T');
			return RefRecord(off, len, first);
		} else {
			// Not DNA and not a gap, ignore it
#ifndef NDEBUG
			// Debug builds only: report anything that is not whitespace
			if(!isspace(c)) {
				cerr << "Unexpected character in sequence: ";
				if(isprint(c)) {
					cerr << ((char)c) << endl;
				} else {
					cerr << "(" << c << ")" << endl;
				}
			}
#endif
		}
		c = in.get();
	}
	// Stopped on '>' or end of input (-1); remember which for next call
	lastc = c;
	return RefRecord(off, len, first);
}
/** * Parse a single quality string from fb and store qualities in r. * Assume the next character obtained via fb.get() is the first * character of the quality string. When returning, the next * character returned by fb.peek() or fb.get() should be the first * character of the following line. */ int parseQuals( Read& r, FileBuf& fb, int firstc, int readLen, int trim3, int trim5, bool intQuals, bool phred64, bool solexa64) { int c = firstc; assert(c != '\n' && c != '\r'); r.qual.clear(); if (intQuals) { while (c != '\r' && c != '\n' && c != -1) { bool neg = false; int num = 0; while(!isspace(c) && !fb.eof()) { if(c == '-') { neg = true; assert_eq(num, 0); } else { if(!isdigit(c)) { char buf[2048]; cerr << "Warning: could not parse quality line:" << endl; fb.getPastNewline(); cerr << fb.copyLastN(buf); buf[2047] = '\0'; cerr << buf; throw 1; } assert(isdigit(c)); num *= 10; num += (c - '0'); } c = fb.get(); } if(neg) num = 0; // Phred-33 ASCII encode it and add it to the back of the // quality string r.qual.append('!' + num); // Skip over next stretch of whitespace while(c != '\r' && c != '\n' && isspace(c) && !fb.eof()) { c = fb.get(); } } } else { while (c != '\r' && c != '\n' && c != -1) { r.qual.append(charToPhred33(c, solexa64, phred64)); c = fb.get(); while(c != '\r' && c != '\n' && isspace(c) && !fb.eof()) { c = fb.get(); } } } if ((int)r.qual.length() < readLen-1 || ((int)r.qual.length() < readLen && !r.color)) { tooFewQualities(r.name); } r.qual.trimEnd(trim3); if(r.qual.length()-trim5 < r.patFw.length()) { assert(gColor && r.primer != -1); assert_gt(trim5, 0); trim5--; } r.qual.trimBegin(trim5); if(r.qual.length() <= 0) return 0; assert_eq(r.qual.length(), r.patFw.length()); while(fb.peek() == '\n' || fb.peek() == '\r') fb.get(); return (int)r.qual.length(); }