// Print the mean per-position error probability, as a percentage, over all
// reads in the files flist->files[flist->curr .. flist->num_files-1].
//
// For each read, every quality character is converted to an error probability
// via qual_prob[] (indexed by quality char minus the file's FASTQ offset) and
// added to a per-position sum.  Position i is then averaged over the number of
// reads that are longer than i.  Output goes to stdout as " %.2Lf" per
// position, followed by a newline.
//
// Reads with an empty quality string are skipped.  If memory cannot be
// allocated an error is printed to stderr and the process exits.
void filelist_mean_err(FileList *flist)
{
  size_t i, f, maxread = 0, readcap = 512, newcap, carry;
  long double *sumprob = malloc(readcap * sizeof(*sumprob));
  size_t *counts = malloc(readcap * sizeof(*counts));

  if(sumprob == NULL || counts == NULL) {
    fprintf(stderr, "filelist_mean_err: out of memory\n");
    exit(EXIT_FAILURE);
  }

  for(i = 0; i < readcap; i++) { sumprob[i] = 0; counts[i] = 0; }

  read_t *r = &flist->read;
  int fmt, fqoffset = 33, minq, maxq;

  for(f = flist->curr; f < flist->num_files; f++)
  {
    // Fall back to offset 33 when the format cannot be detected
    fmt = seq_guess_fastq_format(flist->files[f], &minq, &maxq);
    fqoffset = (fmt == -1 ? 33 : FASTQ_OFFSET[fmt]);

    while(seq_read(flist->files[f], r) > 0)
    {
      size_t qlen = r->qual.end;

      // A read without quality scores contributes nothing; skipping it also
      // avoids indexing counts[qlen-1] with qlen == 0.
      if(qlen == 0) continue;

      if(qlen > readcap)
      {
        // Grow both arrays together and zero the newly added tail
        newcap = ROUNDUP2POW(qlen);

        long double *new_sumprob = realloc(sumprob, newcap * sizeof(*sumprob));
        if(new_sumprob != NULL) sumprob = new_sumprob;
        size_t *new_counts = realloc(counts, newcap * sizeof(*counts));
        if(new_counts != NULL) counts = new_counts;

        if(new_sumprob == NULL || new_counts == NULL) {
          fprintf(stderr, "filelist_mean_err: out of memory\n");
          free(sumprob);
          free(counts);
          exit(EXIT_FAILURE);
        }

        for(i = readcap; i < newcap; i++) { sumprob[i] = 0; counts[i] = 0; }
        readcap = newcap;
      }

      // counts[] temporarily holds the number of reads of each exact length
      counts[qlen-1]++;

      // NOTE(review): a quality char below fqoffset would give a negative
      // index into qual_prob[] -- confirm the table/inputs rule this out.
      for(i = 0; i < qlen; i++)
        sumprob[i] += qual_prob[r->qual.b[i] - fqoffset];

      maxread = MAX2(maxread, qlen);
    }
  }

  // Convert counts to cumulative (reverse): counts[i] becomes the number of
  // reads that have a base at position i
  for(i = maxread, carry = 0; i-- > 0; ) {
    carry += counts[i];
    counts[i] = carry;
  }

  for(i = 0; i < maxread; i++)
    printf(" %.2Lf", 100.0 * sumprob[i] / counts[i]);
  printf("\n");

  free(counts);
  free(sumprob);
}
// Load reads into a buffer and use them to guess the quality score offset // Returns -1 if no quality scores // Defaults to 0 if not recognisable (offset:33, min:33, max:126) static inline int guess_fastq_format(seq_file_t *sf) { // Detect fastq offset int min_qual = INT_MAX, max_qual = INT_MIN; int fmt = seq_guess_fastq_format(sf, &min_qual, &max_qual); // fmt == -1 if no quality scores found if(fmt == -1) { if(seq_is_fastq(sf) || seq_is_sam(sf) || seq_is_bam(sf)) warn("Couldn't find qual scores in %s\n", sf->path); return -1; } status("%s: Qual scores: %s [offset: %i, range: [%i,%i], sample: [%i,%i]]\n", sf->path, FASTQ_FORMATS[fmt], FASTQ_OFFSET[fmt], FASTQ_MIN[fmt], FASTQ_MAX[fmt], min_qual, max_qual); // Test min and max fastq scores int qoffset = FASTQ_OFFSET[fmt], qmax = FASTQ_MAX[fmt]; if(min_qual > qoffset + 20) { warn("Input file has min quality score %i but qoffset is set to %i: %s\n" " Have you predefined an incorrect fastq offset? " "Or is cortex guessing it wrong?", min_qual, qoffset, sf->path); } else if(max_qual > qmax + 20) { warn("Input file has max quality score %i but expected qmax is to %i: %s\n" " Have you predefined an incorrect fastq offset? " "Or is cortex guessing it wrong?", max_qual, qoffset, sf->path); } return fmt; }
// Initialise flist from an array of num file paths.
//
// Each path is opened with seq_open() and its FASTQ quality format is detected
// with seq_guess_fastq_format(); the matching offset is stored in
// flist->fqoffsets[i] and a " profile: ..." line is printed to stdout.
// The read buffer and the errors array (512 zeroed entries) are also set up.
//
// Calls die() if memory cannot be allocated, a file cannot be opened or its
// FASTQ format cannot be detected.
void filelist_alloc(FileList *flist, char **paths, size_t num)
{
  size_t i;

  flist->num_files = num;
  flist->curr = 0;

  // calloc checks num * size for overflow; with num == 0 a NULL return is
  // permitted, so only treat NULL as failure when entries were requested
  flist->files = calloc(num, sizeof(seq_file_t*));
  flist->fqoffsets = calloc(num, sizeof(int));
  if(num > 0 && (flist->files == NULL || flist->fqoffsets == NULL))
    die("Out of memory");

  for(i = 0; i < num; i++)
  {
    if((flist->files[i] = seq_open(paths[i])) == NULL)
      die("Cannot open: %s", paths[i]);

    int min_qual, max_qual, fmt;
    fmt = seq_guess_fastq_format(flist->files[i], &min_qual, &max_qual);
    if(fmt < 0) die("Cannot detect FASTQ format: %s", paths[i]);

    flist->fqoffsets[i] = FASTQ_OFFSET[fmt];
    printf(" profile: %s [offset: %i]\n", paths[i], FASTQ_OFFSET[fmt]);
  }

  seq_read_alloc(&flist->read);
  flist->filesready = 1;

  flist->errors_cap = 512;
  flist->errors_len = 0;
  flist->errors = calloc(flist->errors_cap, sizeof(size_t));
  if(flist->errors == NULL)
    die("Out of memory");
}