// Read every entry of the sequence file at `path` and store a strdup()'d copy
// of each entry's sequence string in a growable array.
//   path:     file opened with seq_open(); die() is called if it cannot be opened
//   seqs_ptr: out - receives the array of sequence strings
//   cap_ptr:  out - receives the allocated capacity of the array (>= return value)
// Returns the number of sequences stored.
// Neither the array nor the strings are released here; the caller owns them.
int load_seqs(const char *path, char ***seqs_ptr, int *cap_ptr)
{
  int cap = 1024;
  char **seqs = my_malloc(sizeof(char*) * cap, __FILE__, __LINE__);

  read_t read;
  seq_read_alloc(&read);

  seq_file_t *file = seq_open(path);
  if(file == NULL) die("Cannot open file: %s.", path);

  int num = 0;

  // Only a positive return counts as an entry, matching the other seq_read()
  // callers in this file. A plain truth test would keep looping (and keep
  // storing stale data) if seq_read() reported an error with a negative value.
  while(seq_read(file, &read) > 0)
  {
    if(num == cap)
    {
      // Largest value an int can hold, without needing <limits.h>
      const unsigned int int_max = (unsigned int)-1 / 2;
      if((unsigned int)cap > int_max / 2)
        die("Too many sequences in file: %s.", path);

      // Grow through a temporary so the old array is not lost on failure
      int new_cap = cap * 2;
      char **grown = realloc(seqs, sizeof(char*) * new_cap);
      if(grown == NULL) die("Out of memory loading file: %s.", path);
      seqs = grown;
      cap = new_cap;
    }

    char *copy = strdup(read.seq.b);
    if(copy == NULL) die("Out of memory loading file: %s.", path);
    seqs[num++] = copy;
  }

  seq_read_dealloc(&read);
  seq_close(file);

  *seqs_ptr = seqs;
  *cap_ptr = cap;
  return num;
}
// Load all reads from files into a read buffer and close the seq_files // Returns the number of reads loaded size_t seq_load_all_reads(seq_file_t **seq_files, size_t num_files, ReadBuffer *rbuf) { status("Loading sequences..."); size_t i, nreads = rbuf->len; read_t r; seq_read_alloc(&r); for(i = 0; i < num_files; i++) { status(" file: %s", seq_files[i]->path); while(seq_read_primary(seq_files[i], &r) > 0) { read_buf_push(rbuf, &r, 1); // copy read seq_read_alloc(&r); // allocate new read } seq_close(seq_files[i]); } seq_read_dealloc(&r); return rbuf->len - nreads; }
// Print one line per entry of a sequence file: the entry name, a tab, then the
// `end` fields of its name, sequence and quality buffers in square brackets.
// Expects exactly one argument: the path of the file to read.
// Exits with EXIT_FAILURE (after a message on stderr) on a bad argument count,
// if the file cannot be opened, or if the read cannot be allocated.
int main(int argc, char **argv)
{
  if(argc != 2) {
    fprintf(stderr, "usage: %s <file>\n", argc > 0 ? argv[0] : "prog");
    exit(EXIT_FAILURE);
  }

  // Open the file first so nothing has been allocated if it fails
  seq_file_t *f = seq_open(argv[1]);
  if(f == NULL) {
    fprintf(stderr, "Cannot open file: %s\n", argv[1]);
    exit(EXIT_FAILURE);
  }

  read_t *r = seq_read_alloc();
  if(r == NULL) {
    fprintf(stderr, "Out of memory\n");
    seq_close(f);
    exit(EXIT_FAILURE);
  }

  while(seq_read(f, r) > 0) {
    // Cast so the arguments match %lu whatever integer type `end` has
    printf("%s\t[%lu,%lu,%lu]\n", r->name.b,
           (unsigned long)r->name.end,
           (unsigned long)r->seq.end,
           (unsigned long)r->qual.end);
  }

  seq_close(f);
  seq_read_destroy(r);

  return EXIT_SUCCESS;
}
// Load reads from a file, apply sequence error, dump // Return total number of bases size_t mutate_reads(seq_file_t *sfile, gzFile gzout, FileList *flist, float err) { printf(" reading: %s\n", sfile->path); read_t r; seq_read_alloc(&r); size_t num_bases = 0; while(seq_read(sfile, &r) > 0) { if(err > 0) add_seq_error_rate(r.seq.b, r.seq.end, err); else add_seq_error_profile(r.seq.b, r.seq.end, flist); gzprintf(gzout, "@%s\n%s\n+\n%s\n", r.name.b, r.seq.b, r.qual.b); num_bases += r.seq.end; } seq_read_dealloc(&r); return num_bases; }
// Initialise flist from `num` FASTQ paths.
// Each path is opened with seq_open() and its quality encoding is detected
// with seq_guess_fastq_format(); the matching FASTQ_OFFSET entry is stored in
// flist->fqoffsets[i]. Also allocates flist->read and a zeroed errors array of
// 512 entries.
//   flist: structure to fill in
//   paths: array of `num` file paths
//   num:   number of paths
// Calls die() if memory cannot be allocated, a file cannot be opened or its
// FASTQ format cannot be detected.
void filelist_alloc(FileList *flist, char **paths, size_t num)
{
  size_t i;

  flist->num_files = num;
  flist->curr = 0;

  // calloc() checks num*size for overflow; sizes follow the pointed-to types
  flist->files = calloc(num, sizeof(*flist->files));
  flist->fqoffsets = calloc(num, sizeof(*flist->fqoffsets));

  // A zero-length request may legitimately return NULL, so only fail if num > 0
  if(num > 0 && (flist->files == NULL || flist->fqoffsets == NULL))
    die("Out of memory");

  for(i = 0; i < num; i++)
  {
    if((flist->files[i] = seq_open(paths[i])) == NULL)
      die("Cannot open: %s", paths[i]);

    int min, max, fmt;
    fmt = seq_guess_fastq_format(flist->files[i], &min, &max);
    if(fmt < 0) die("Cannot detect FASTQ format: %s", paths[i]);

    flist->fqoffsets[i] = FASTQ_OFFSET[fmt];
    printf(" profile: %s [offset: %i]\n", paths[i], FASTQ_OFFSET[fmt]);
  }

  seq_read_alloc(&flist->read);
  flist->filesready = 1;

  flist->errors_cap = 512;
  flist->errors_len = 0;
  flist->errors = calloc(flist->errors_cap, sizeof(*flist->errors));
  if(flist->errors == NULL) die("Out of memory");
}
static void test_kmer_occur_filter() { // Construct 1 colour graph with kmer-size=11 dBGraph graph; const size_t kmer_size = 11, ncols = 3; size_t i; // Create graph db_graph_alloc(&graph, kmer_size, ncols, 1, 2000, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL | DBG_ALLOC_BKTLOCKS); // xyz------->>> y > < X // TTCGACCCGACAGGGCAACGTAGTCCGACAGGGCACAGCCCTGTCGGGGGGTGCA #define NUM_NODES 3 #define NUM_READS 3 const char *tmp[NUM_READS] = { "AACA", "TTCGACCCGACAGGGCAACGTAGTCCGACAGGGCACAGCCCTGTCGGGGGGTGCA", "TCTAGCATGTGTGTT"}; read_t reads[NUM_READS]; for(i = 0; i < NUM_READS; i++) { seq_read_alloc(&reads[i]); seq_read_set(&reads[i], tmp[i]); } KOGraph kograph = kograph_create(reads, NUM_READS, true, 0, 1, &graph); TASSERT(kograph.nchroms == NUM_READS); TASSERT(kograph.koccurs != NULL); KOccurRunBuffer koruns, koruns_tmp, koruns_ended; korun_buf_alloc(&koruns, 16); korun_buf_alloc(&koruns_tmp, 16); korun_buf_alloc(&koruns_ended, 16); // Check CCCGACAGGGCAA starts at CCCGACAGGGC // x=CCCGACAGGGC, y=CCGACAGGGCA, z=CGACAGGGCAA // X=GCCCTGTCGGG, Y=TGCCCTGTCGG, Z=TTGCCCTGTCG dBNode nodes[NUM_NODES]; for(i = 0; i < NUM_NODES; i++) nodes[i] = db_graph_find_str(&graph, &"CCCGACAGGGCAA"[i]); korun_buf_reset(&koruns); korun_buf_reset(&koruns_ended); kograph_filter_extend(&kograph, nodes, NUM_NODES, true, 0, 0, &koruns, &koruns_tmp, &koruns_ended); // Checks TASSERT2(koruns.len == 1, "koruns.len: %zu", koruns.len); TASSERT(koruns.b[0].strand == STRAND_PLUS); // left-to-right with ref TASSERT2(koruns.b[0].chrom == 1, "chrom: %zu", (size_t)koruns.b[0].chrom); TASSERT2(koruns.b[0].first == 5, "offset: %zu", (size_t)koruns.b[0].first); TASSERT2(koruns.b[0].last == 7, "last: %zu", (size_t)koruns.b[0].last); // Test reverse db_nodes_reverse_complement(nodes, NUM_NODES); korun_buf_reset(&koruns); korun_buf_reset(&koruns_ended); kograph_filter_extend(&kograph, nodes, 1, true, 0, 0, &koruns, &koruns_tmp, &koruns_ended); kograph_filter_extend(&kograph, nodes+1, 1, true, 0, 1, &koruns, &koruns_tmp, 
&koruns_ended); kograph_filter_extend(&kograph, nodes+2, 1, true, 0, 2, &koruns, &koruns_tmp, &koruns_ended); // Print out for debugging // printf("koruns: "); // koruns_print(koruns.b, koruns.len, kmer_size, stdout); // printf("\nkoruns_ended: "); // koruns_print(koruns_ended.b, koruns_ended.len, kmer_size, stdout); // printf("\n"); // Check results match: // koruns: chromid:1:17-5:-, chromid:1:37-47:+ // koruns_ended: chromid:1:34-24:- TASSERT2(koruns.len == 2, "koruns.len: %zu", koruns.len); TASSERT2(koruns_ended.len == 1, "koruns_ended.len: %zu", koruns_ended.len); TASSERT(koruns.b[0].strand == STRAND_MINUS); // reverse complement of ref TASSERT2(koruns.b[0].chrom == 1, "chrom: %zu", (size_t)koruns.b[0].chrom); TASSERT2(koruns.b[0].first == 7, "offset: %zu", (size_t)koruns.b[0].first); TASSERT2(koruns.b[0].last == 5, "last: %zu", (size_t)koruns.b[0].last); korun_buf_dealloc(&koruns); korun_buf_dealloc(&koruns_tmp); korun_buf_dealloc(&koruns_ended); for(i = 0; i < NUM_READS; i++) seq_read_dealloc(&reads[i]); kograph_dealloc(&kograph); db_graph_dealloc(&graph); }
// If seq2 is NULL, read pair of entries from first file // Otherwise read an entry from each void align_from_file(const char *path1, const char *path2, void (align)(read_t *r1, read_t *r2), bool use_zlib) { seq_file_t *sf1, *sf2; if((sf1 = open_seq_file(path1, use_zlib)) == NULL) { fprintf(stderr, "Alignment Error: couldn't open file %s\n", path1); fflush(stderr); return; } if(path2 == NULL) { sf2 = sf1; } else if((sf2 = open_seq_file(path2, use_zlib)) == NULL) { fprintf(stderr, "Alignment Error: couldn't open file %s\n", path1); fflush(stderr); return; } // fprintf(stderr, "File buffer %zu zlib: %i\n", sf1->in.size, seq_use_gzip(sf1)); read_t read1, read2; seq_read_alloc(&read1); seq_read_alloc(&read2); // Loop while we can read a sequence from the first file unsigned long alignments; for(alignments = 0; seq_read(sf1, &read1) > 0; alignments++) { if(seq_read(sf2, &read2) <= 0) { fprintf(stderr, "Alignment Error: Odd number of sequences - " "I read in pairs!\n"); fflush(stderr); break; } (align)(&read1, &read2); } // warn if no bases read if(alignments == 0) { fprintf(stderr, "Alignment Warning: empty input\n"); fflush(stderr); } // Close files seq_close(sf1); if(path2 != NULL) seq_close(sf2); // Free memory seq_read_dealloc(&read1); seq_read_dealloc(&read2); }
// Returns num of bases printed size_t sim_reads(seq_file_t *reffile, gzFile out0, gzFile out1, FileList *flist, float err_rate, size_t insert, double insert_stddev, size_t rlen, double depth) { size_t i, chromcap = 16, nchroms, glen = 0, nreads, chr, pos0, pos1, tlen; read_t *chroms; tlen = rlen + (out1 == NULL ? 0 : insert + rlen); chroms = malloc(chromcap * sizeof(read_t)); nchroms = 0; // Load genome printf(" Loaded contigs:"); while(1) { if(nchroms == chromcap) chroms = realloc(chroms, (chromcap*=2)*sizeof(read_t)); seq_read_alloc(&chroms[nchroms]); if(seq_read(reffile, &chroms[nchroms]) <= 0) { seq_read_dealloc(&chroms[nchroms]); break; } if(chroms[nchroms].seq.end < tlen) { seq_read_dealloc(&chroms[nchroms]); } else { seq_read_truncate_name(&chroms[nchroms]); printf(" %s[%zu]", chroms[nchroms].name.b, chroms[nchroms].seq.end); glen += chroms[nchroms].seq.end; nchroms++; } } printf("\n Genome size: %zu\n", glen); if(nchroms == 0) { die("No sequences long enough in ref genome file [min len: %zu]: %s", tlen, reffile->path); } // Sample nreads = (glen * depth) / (out1 == NULL ? rlen : (2 * rlen)); char read0[rlen+1], read1[rlen+1]; read0[rlen] = read1[rlen] = '\0'; printf("Sampling %zu %sreads...\n", nreads, out1 == NULL ? "single " : "paired-end "); // Sample paired-end if out1 != NULL for(i = 0; i < nreads; i++) { chr = (nchroms == 1) ? 0 : rand_chrom(chroms, nchroms, glen); pos0 = random_uniform(chroms[chr].seq.end - (out1 == NULL ? 
rlen : tlen)); pos1 = pos0; memcpy(read0, chroms[chr].seq.b+pos0, rlen); if(out1 != NULL) { pos1 = pos0 + rlen + insert + ran_normal()*insert_stddev; if(pos1 + rlen > chroms[chr].seq.end) pos1 = chroms[chr].seq.end-rlen; memcpy(read1, chroms[chr].seq.b+pos1, rlen); } if(flist != NULL) { add_seq_error_profile(read0, rlen, flist); if(out1 != NULL) add_seq_error_profile(read1, rlen, flist); } else if(err_rate >= 0) { add_seq_error_rate(read0, rlen, err_rate); } gzprintf(out0, ">r%zu:%s:%zu:%zu%s\n%.*s\n", i, chroms[chr].name.b, pos0, pos1, (out1 != NULL ? "/1" : ""), (int)rlen, read0); if(out1 != NULL) { dna_revcmp(read1, rlen); gzprintf(out1, ">r%zu:%s:%zu:%zu/2\n%.*s\n", i, chroms[chr].name.b, pos0, pos1, (int)rlen, read1); } } for(i = 0; i < nchroms; i++) seq_read_dealloc(&chroms[i]); free(chroms); size_t num_bases = nreads * rlen; if(out1 != NULL) num_bases *= 2; return num_bases; }