/* Estimate GC-content bias from observed fragment positions.
 *
 * For every entry in foreground_position_table (up to max_dump), this samples
 * the GC fraction of a median_frag_len-sized window at the observed fragment
 * start (foreground) and at a uniformly drawn position within the same
 * transcript interval (background). Each observation is weighted by the
 * inverse of the sequencing-bias prediction (when seqbias models are given),
 * so that GC bias is measured independently of sequence-specific bias.
 *
 * ref_filename: indexed FASTA file with the reference sequences.
 * foreground_position_table: observed fragment positions to train on.
 * median_frag_len: window length used when measuring GC content.
 * seqbias: per-strand sequencing bias models; seqbias[0] == NULL disables
 *          bias weighting (weight 1.0).
 * task_name: logger task used for progress reporting.
 */
GCBias::GCBias(const char* ref_filename, PosTable& foreground_position_table,
               pos_t median_frag_len, sequencing_bias* seqbias[2],
               const char* task_name)
{
    faidx_t* ref_file = fai_load(ref_filename);
    if (!ref_file) {
        Logger::abort("Can't open fasta file '%s'.", ref_filename);
    }

    std::vector<ReadPos> foreground_positions;
    const size_t max_dump = 10000000;
    foreground_position_table.dump(foreground_positions, max_dump);

    // Sort by sequence name so each chromosome is fetched from the FASTA
    // index exactly once below.
    std::sort(foreground_positions.begin(), foreground_positions.end(),
              ReadPosSeqnameCmp());

    Logger::push_task(task_name, foreground_positions.size());
    LoggerTask& task = Logger::get_task(task_name);

    // (gc fraction, weight) pairs
    typedef std::pair<float, float> WeightedGC;
    std::vector<WeightedGC> foreground_gc, background_gc;

    int seqlen = 0;
    SeqName curr_seqname;
    char* seq = NULL;      // current chromosome sequence (malloc'd by faidx)
    twobitseq tbseq;       // two-bit encoding of seq
    twobitseq tbseqrc;     // reverse complement of tbseq
    rng_t rng;

    // Left context length required by the sequencing bias model, if any.
    pos_t L = seqbias[0] ? seqbias[0]->getL() : 0;

    std::vector<ReadPos>::iterator i;
    for (i = foreground_positions.begin(); i != foreground_positions.end(); ++i) {
        // Load/switch chromosomes lazily as they appear in the sorted stream.
        if (i->seqname != curr_seqname) {
            free(seq);
            seq = faidx_fetch_seq(ref_file, i->seqname.get().c_str(),
                                  0, INT_MAX, &seqlen);
            Logger::debug("read sequence %s.", i->seqname.get().c_str());

            if (seq == NULL) {
                Logger::warn("warning: reference sequence not found, skipping.");
            }
            else {
                for (char* c = seq; *c; c++) *c = tolower(*c);
                tbseq = seq;
                tbseqrc = tbseq;
                tbseqrc.revcomp();
            }
            curr_seqname = i->seqname;
        }

        if (seq == NULL || (pos_t) tbseq.size() < median_frag_len) continue;

        // Fragments with many copies tend to have too much weight when
        // training, leading to somewhat less than stable results.
        if (i->count > 4) continue;

        // Sample a background position uniformly within the interval.
        // NOTE(review): if i->end - median_frag_len < i->start + L, the
        // distribution's bounds are inverted, which is undefined behavior for
        // uniform_int_distribution — confirm callers only supply intervals at
        // least median_frag_len + L long.
        boost::random::uniform_int_distribution<pos_t> random_uniform(
            i->start + L, i->end - median_frag_len);
        pos_t pos = random_uniform(rng);

        float gc = (float) gc_count(seq + pos, median_frag_len) / median_frag_len;
        // Weight by inverse predicted sequencing bias (both mates/strands).
        float sb = seqbias[0] ?
            seqbias[0]->get_bias(tbseq, pos - L) *
            seqbias[1]->get_bias(tbseqrc, seqlen - pos - 1 - L) : 1.0;
        background_gc.push_back(WeightedGC(gc, 1.0 / sb));

        // Sample the foreground (observed) position, if the window fits.
        if (i->strand == 0) {
            if (i->pos >= i->start && i->pos + median_frag_len - 1 <= i->end) {
                float sb = seqbias[0] ?
                    seqbias[0]->get_bias(tbseq, i->pos - L) *
                    seqbias[1]->get_bias(tbseqrc, seqlen - i->pos - 1 - L) : 1.0;
                foreground_gc.push_back(
                    WeightedGC((float) gc_count(seq + i->pos, median_frag_len)
                                   / median_frag_len,
                               1.0 / sb));
            }
        }
        else {
            // Negative strand: i->pos is the fragment end, so the window
            // extends median_frag_len to the left.
            if (i->pos - median_frag_len >= i->start && i->pos <= i->end) {
                float sb = seqbias[0] ?
                    seqbias[0]->get_bias(tbseq, i->pos - median_frag_len - L) *
                    seqbias[1]->get_bias(tbseqrc, seqlen - i->pos - median_frag_len - 1 - L) : 1.0;
                foreground_gc.push_back(
                    WeightedGC((float) gc_count(seq + i->pos - median_frag_len, median_frag_len)
                                   / median_frag_len,
                               1.0 / sb));
            }
        }
        task.inc();
    }
    free(seq);
    fai_destroy(ref_file);

#if 0
    // Debug dump of the sampled GC observations (disabled).
    FILE* out = fopen("gcbias.tsv", "w");
    fprintf(out, "group\tgc\tweight\n");
    BOOST_FOREACH (WeightedGC& value, foreground_gc) {
        fprintf(out, "foreground\t%f\t%f\n",
                (double) value.first, (double) value.second);
    }
void sequencing_bias::build(const char* ref_fn, PosTable& T, size_t max_reads, pos_t L, pos_t R, const char* task_name, double complexity_penalty) { Logger::push_task(task_name); clear(); const size_t min_positions = 1000; if (T.size() < min_positions) return; this->ref_fn = ref_fn; this->L = L; this->R = R; const size_t max_dump = 10000000; std::vector<ReadPos> S; S.reserve(max_dump); T.dump(S, max_dump); /* sort by tid (so we can load one chromosome at a time) */ random_shuffle(S.begin(), S.end()); sort(S.begin(), S.end(), ReadPosSeqnameCmp()); //sort(S.begin(), S.end(), ReadPosCountCmp()); //sort(S.begin(), S.begin() + max_reads, ReadPosSeqnameCmp()); /* sample foreground and background kmer frequencies */ ref_f = fai_load(ref_fn); if (ref_f == NULL) { Logger::abort("Can't open fasta file '%s'.", ref_fn); } std::deque<twobitseq*> foreground_seqs; std::deque<twobitseq*> background_seqs; /* background sampling */ int bg_samples = 1; // make this many samples for each read int bg_sample_num; // keep track of the number of samples made pos_t bg_pos; int seqlen = 0; SeqName curr_seqname; char* seq = NULL; char* local_seq; local_seq = new char[ L + R + 2 ]; local_seq[L+R+1] = '\0'; std::vector<ReadPos>::iterator i; for (i = S.begin(); i != S.end() && i != S.begin() + max_reads; ++i) { /* Load/switch sequences (chromosomes) as they are encountered in the * read stream. The idea here is to avoid thrashing by loading a large * sequence, but also avoid overloading memory by only loading one * chromosome at a time. 
*/ if (i->seqname != curr_seqname) { if (seq) free(seq); seq = faidx_fetch_seq(ref_f, i->seqname.get().c_str(), 0, INT_MAX, &seqlen); Logger::debug("read sequence %s.", i->seqname.get().c_str()); if (seq == NULL) { Logger::warn("warning: reference sequence not found, skipping."); } else { for (char* c = seq; *c; c++) *c = tolower(*c); } curr_seqname = i->seqname; } if (seq == NULL) continue; /* add a foreground sequence */ if (i->strand == strand_neg) { if (i->pos < R || i->pos >= seqlen - L) continue; memcpy(local_seq, seq + i->pos - R, (L+1+R)*sizeof(char)); seqrc(local_seq, L+1+R); } else { if (i->pos < L || i->pos >= seqlen - R) continue; memcpy(local_seq, seq + (i->pos - L), (L+1+R)*sizeof(char)); } if (strchr(local_seq, 'n') != NULL) continue; foreground_seqs.push_back(new twobitseq(local_seq)); /* add a background sequence */ /* adjust the current read position randomly, and sample */ for (bg_sample_num = 0; bg_sample_num < bg_samples;) { random_uniform_int.param( boost::random::uniform_int_distribution<pos_t>::param_type(i->start, i->end)); bg_pos = random_uniform_int(rng); if (i->strand == strand_neg) { if (bg_pos < R || bg_pos >= seqlen - L) continue; memcpy(local_seq, seq + bg_pos - R, (L+1+R)*sizeof(char)); seqrc(local_seq, L+1+R); } else { if (bg_pos < L || bg_pos >= seqlen - R) continue; memcpy(local_seq, seq + (bg_pos-L), (L+1+R)*sizeof(char)); } if (strchr(local_seq, 'n') != NULL) continue; background_seqs.push_back(new twobitseq(local_seq)); bg_sample_num++; } } size_t max_parents = 4; size_t max_distance = 10; /* A bit of a hack: if we are training on very few reads (a couple thousand, * as a opposed to tens of thousands), we tend to end up with too sparse of * a model. 
*/ if (foreground_seqs.size() < 10000) complexity_penalty = 0.25; M = new motif(background_seqs, foreground_seqs, L + 1 + R, max_parents, max_distance, complexity_penalty, task_name); std::deque<twobitseq*>::iterator seqit; for (seqit = background_seqs.begin(); seqit != background_seqs.end(); seqit++) { delete *seqit; } for (seqit = foreground_seqs.begin(); seqit != foreground_seqs.end(); seqit++) { delete *seqit; } free(seq); delete [] local_seq; Logger::pop_task(task_name); }