Example #1
0
GCBias::GCBias(const char* ref_filename, PosTable& foreground_position_table,
	           pos_t median_frag_len,
               sequencing_bias* seqbias[2],
               const char* task_name)
{
	faidx_t* ref_file = fai_load(ref_filename);
	if (!ref_file) {
        Logger::abort("Can't open fasta file '%s'.", ref_filename);
	}

	std::vector<ReadPos> foreground_positions;
    const size_t max_dump = 10000000;
	foreground_position_table.dump(foreground_positions, max_dump);
	std::sort(foreground_positions.begin(), foreground_positions.end(), ReadPosSeqnameCmp());

	Logger::push_task(task_name, foreground_positions.size());
	LoggerTask& task = Logger::get_task(task_name);

    typedef std::pair<float, float> WeightedGC;

	std::vector<WeightedGC> foreground_gc, background_gc;

    int  seqlen = 0;
    SeqName curr_seqname;
    char* seq = NULL;
    twobitseq tbseq;
    twobitseq tbseqrc;
    rng_t rng;

    pos_t L = seqbias[0] ? seqbias[0]->getL() : 0;

    std::vector<ReadPos>::iterator i;
    for (i = foreground_positions.begin(); i != foreground_positions.end(); ++i) {
        if (i->seqname != curr_seqname) {
            free(seq);
            seq = faidx_fetch_seq(ref_file, i->seqname.get().c_str(), 0, INT_MAX, &seqlen);
            Logger::debug("read sequence %s.", i->seqname.get().c_str());

            if (seq == NULL) {
                Logger::warn("warning: reference sequence not found, skipping.");
            }
            else {
                for (char* c = seq; *c; c++) *c = tolower(*c);
                tbseq = seq;
                tbseqrc = tbseq;
                tbseqrc.revcomp();

            }

            curr_seqname = i->seqname;
        }

        if (seq == NULL || (pos_t) tbseq.size() < median_frag_len) continue;

        // fragments with many copies tend to have too much weight when training
        // leading to somewhat less than stable results.
        if (i->count > 4) continue;

        // sample background position
        boost::random::uniform_int_distribution<pos_t> random_uniform(
                i->start + L, i->end - median_frag_len);
        pos_t pos = random_uniform(rng);
        float gc = (float) gc_count(seq + pos, median_frag_len) / median_frag_len;
        float sb = seqbias[0] ?
                   seqbias[0]->get_bias(tbseq, pos - L) *
                   seqbias[1]->get_bias(tbseqrc, seqlen - pos - 1 - L) : 1.0;
        background_gc.push_back(WeightedGC(gc, 1.0 / sb));

        // sample foreground position
        if (i->strand == 0) {
            if (i->pos >= i->start && i->pos + median_frag_len - 1 <= i->end) {
                float sb = seqbias[0] ?
                           seqbias[0]->get_bias(tbseq, i->pos - L) *
                           seqbias[1]->get_bias(tbseqrc, seqlen - i->pos - 1 - L) : 1.0;

                foreground_gc.push_back(
                    WeightedGC((float) gc_count(seq + i->pos, median_frag_len) / median_frag_len,
                               1.0 / sb));
            }
        } else {
            if (i->pos - median_frag_len >= i->start && i->pos <= i->end) {
                float sb = seqbias[0] ?
                           seqbias[0]->get_bias(tbseq, i->pos - median_frag_len - L) *
                           seqbias[1]->get_bias(tbseqrc, seqlen - i->pos - median_frag_len - 1 - L) : 1.0;
                foreground_gc.push_back(
                    WeightedGC((float) gc_count(seq + i->pos - median_frag_len, median_frag_len) / median_frag_len,
                               1.0 /sb));
            }
        }
        task.inc();
    }

    free(seq);
    fai_destroy(ref_file);

#if 0
    FILE* out = fopen("gcbias.tsv", "w");
    fprintf(out, "group\tgc\tweight\n");
    BOOST_FOREACH (WeightedGC& value, foreground_gc) {
        fprintf(out, "foreground\t%f\t%f\n", (double) value.first, (double) value.second);
    }
Example #2
0
void sequencing_bias::build(const char* ref_fn,
                            PosTable& T,
                            size_t max_reads,
                            pos_t L, pos_t R,
                            const char* task_name,
                            double complexity_penalty)
{
    Logger::push_task(task_name);

    clear();
    const size_t min_positions = 1000;
    if (T.size() < min_positions) return;


    this->ref_fn = ref_fn;

    this->L = L;
    this->R = R;

    const size_t max_dump = 10000000;
    std::vector<ReadPos> S;
    S.reserve(max_dump);
    T.dump(S, max_dump);

    /* sort by tid (so we can load one chromosome at a time) */
    random_shuffle(S.begin(), S.end());
    sort(S.begin(), S.end(), ReadPosSeqnameCmp());

    //sort(S.begin(), S.end(), ReadPosCountCmp());
    //sort(S.begin(), S.begin() + max_reads, ReadPosSeqnameCmp());

    /* sample foreground and background kmer frequencies */
    ref_f = fai_load(ref_fn);
    if (ref_f == NULL) {
        Logger::abort("Can't open fasta file '%s'.", ref_fn);
    }

    std::deque<twobitseq*> foreground_seqs;
    std::deque<twobitseq*> background_seqs;

    /* background sampling */
    int bg_samples = 1; // make this many samples for each read
    int bg_sample_num; // keep track of the number of samples made
    pos_t bg_pos;

    int            seqlen    = 0;
    SeqName        curr_seqname;
    char*          seq       = NULL;

    char* local_seq;
    local_seq = new char[ L + R + 2 ];
    local_seq[L+R+1] = '\0';

    std::vector<ReadPos>::iterator i;
    for (i = S.begin(); i != S.end() && i != S.begin() + max_reads; ++i) {

        /* Load/switch sequences (chromosomes) as they are encountered in the
         * read stream. The idea here is to avoid thrashing by loading a large
         * sequence, but also avoid overloading memory by only loading one
         * chromosome at a time. */
        if (i->seqname != curr_seqname) {
            if (seq) free(seq);

            seq = faidx_fetch_seq(ref_f, i->seqname.get().c_str(), 0, INT_MAX, &seqlen);
            Logger::debug("read sequence %s.", i->seqname.get().c_str());

            if (seq == NULL) {
                Logger::warn("warning: reference sequence not found, skipping.");
            }
            else {
                for (char* c = seq; *c; c++) *c = tolower(*c);
            }

            curr_seqname = i->seqname;
        }

        if (seq == NULL) continue;

        /* add a foreground sequence */
        if (i->strand == strand_neg) {
            if (i->pos < R || i->pos >= seqlen - L) continue;
            memcpy(local_seq, seq + i->pos - R, (L+1+R)*sizeof(char));
            seqrc(local_seq, L+1+R);
        }
        else {
            if (i->pos < L || i->pos >= seqlen - R) continue;
            memcpy(local_seq, seq + (i->pos - L), (L+1+R)*sizeof(char));
        }

        if (strchr(local_seq, 'n') != NULL) continue;

        foreground_seqs.push_back(new twobitseq(local_seq));


        /* add a background sequence */
        /* adjust the current read position randomly, and sample */
        for (bg_sample_num = 0; bg_sample_num < bg_samples;) {
            random_uniform_int.param(
                boost::random::uniform_int_distribution<pos_t>::param_type(i->start, i->end));
            bg_pos = random_uniform_int(rng);

            if (i->strand == strand_neg) {
                if (bg_pos < R || bg_pos >= seqlen - L) continue;
                memcpy(local_seq, seq + bg_pos - R, (L+1+R)*sizeof(char));
                seqrc(local_seq, L+1+R);
            }
            else {
                if (bg_pos < L || bg_pos >= seqlen - R) continue;
                memcpy(local_seq, seq + (bg_pos-L), (L+1+R)*sizeof(char));
            }

            if (strchr(local_seq, 'n') != NULL) continue;

            background_seqs.push_back(new twobitseq(local_seq));
            bg_sample_num++;
        }
    }


    size_t max_parents  = 4;
    size_t max_distance = 10;

    /* A bit of a hack: if we are training on very few reads (a couple thousand,
     * as a opposed to tens of thousands), we tend to end up with too sparse of
     * a model. */
    if (foreground_seqs.size() < 10000) complexity_penalty = 0.25;

    M = new motif(background_seqs,
                  foreground_seqs,
                  L + 1 + R,
                  max_parents,
                  max_distance,
                  complexity_penalty,
                  task_name);

    std::deque<twobitseq*>::iterator seqit;
    for (seqit = background_seqs.begin(); seqit != background_seqs.end(); seqit++) {
        delete *seqit;
    }

    for (seqit = foreground_seqs.begin(); seqit != foreground_seqs.end(); seqit++) {
        delete *seqit;
    }

    free(seq);
    delete [] local_seq;

    Logger::pop_task(task_name);
}