// Create chrom->read genome hash // `chroms` and `genome` must already be allocated void chrom_hash_load2(seq_file_t **seq_files, size_t num_files, ReadBuffer *chroms, ChromHash *genome) { size_t i; khiter_t k; int hret; seq_load_all_reads(seq_files, num_files, chroms); for(i = 0; i < chroms->len; i++) { seq_read_to_uppercase(&chroms->b[i]); seq_read_truncate_name(&chroms->b[i]); if(strchr(chroms->b[i].name.b,':') != NULL) die("Please remove colons from chromosome names [%s]", chroms->b[i].name.b); k = kh_put(kChromHash, genome, chroms->b[i].name.b, &hret); if(hret == 0) warn("duplicate chromosome (take first only): '%s'", chroms->b[i].name.b); else kh_value(genome, k) = &chroms->b[i]; } }
// Returns num of bases printed size_t sim_reads(seq_file_t *reffile, gzFile out0, gzFile out1, FileList *flist, float err_rate, size_t insert, double insert_stddev, size_t rlen, double depth) { size_t i, chromcap = 16, nchroms, glen = 0, nreads, chr, pos0, pos1, tlen; read_t *chroms; tlen = rlen + (out1 == NULL ? 0 : insert + rlen); chroms = malloc(chromcap * sizeof(read_t)); nchroms = 0; // Load genome printf(" Loaded contigs:"); while(1) { if(nchroms == chromcap) chroms = realloc(chroms, (chromcap*=2)*sizeof(read_t)); seq_read_alloc(&chroms[nchroms]); if(seq_read(reffile, &chroms[nchroms]) <= 0) { seq_read_dealloc(&chroms[nchroms]); break; } if(chroms[nchroms].seq.end < tlen) { seq_read_dealloc(&chroms[nchroms]); } else { seq_read_truncate_name(&chroms[nchroms]); printf(" %s[%zu]", chroms[nchroms].name.b, chroms[nchroms].seq.end); glen += chroms[nchroms].seq.end; nchroms++; } } printf("\n Genome size: %zu\n", glen); if(nchroms == 0) { die("No sequences long enough in ref genome file [min len: %zu]: %s", tlen, reffile->path); } // Sample nreads = (glen * depth) / (out1 == NULL ? rlen : (2 * rlen)); char read0[rlen+1], read1[rlen+1]; read0[rlen] = read1[rlen] = '\0'; printf("Sampling %zu %sreads...\n", nreads, out1 == NULL ? "single " : "paired-end "); // Sample paired-end if out1 != NULL for(i = 0; i < nreads; i++) { chr = (nchroms == 1) ? 0 : rand_chrom(chroms, nchroms, glen); pos0 = random_uniform(chroms[chr].seq.end - (out1 == NULL ? rlen : tlen)); pos1 = pos0; memcpy(read0, chroms[chr].seq.b+pos0, rlen); if(out1 != NULL) { pos1 = pos0 + rlen + insert + ran_normal()*insert_stddev; if(pos1 + rlen > chroms[chr].seq.end) pos1 = chroms[chr].seq.end-rlen; memcpy(read1, chroms[chr].seq.b+pos1, rlen); } if(flist != NULL) { add_seq_error_profile(read0, rlen, flist); if(out1 != NULL) add_seq_error_profile(read1, rlen, flist); } else if(err_rate >= 0) { add_seq_error_rate(read0, rlen, err_rate); } gzprintf(out0, ">r%zu:%s:%zu:%zu%s\n%.*s\n", i, chroms[chr].name.b, pos0, pos1, (out1 != NULL ? "/1" : ""), (int)rlen, read0); if(out1 != NULL) { dna_revcmp(read1, rlen); gzprintf(out1, ">r%zu:%s:%zu:%zu/2\n%.*s\n", i, chroms[chr].name.b, pos0, pos1, (int)rlen, read1); } } for(i = 0; i < nchroms; i++) seq_read_dealloc(&chroms[i]); free(chroms); size_t num_bases = nreads * rlen; if(out1 != NULL) num_bases *= 2; return num_bases; }