/* Estimate GC-content bias from observed fragment positions.
 *
 * For every entry in foreground_position_table (up to max_dump), this samples
 * the GC fraction of a median_frag_len-sized window at the observed fragment
 * start (foreground) and at a uniformly drawn position within the same
 * transcript interval (background). Each observation is weighted by the
 * inverse of the sequencing-bias prediction (when seqbias models are given),
 * so that GC bias is measured independently of sequence-specific bias.
 *
 * ref_filename: indexed FASTA file with the reference sequences.
 * foreground_position_table: observed fragment positions to train on.
 * median_frag_len: window length used when measuring GC content.
 * seqbias: per-strand sequencing bias models; seqbias[0] == NULL disables
 *          bias weighting (weight 1.0).
 * task_name: logger task used for progress reporting.
 */
GCBias::GCBias(const char* ref_filename, PosTable& foreground_position_table,
               pos_t median_frag_len, sequencing_bias* seqbias[2],
               const char* task_name)
{
    faidx_t* ref_file = fai_load(ref_filename);
    if (!ref_file) {
        Logger::abort("Can't open fasta file '%s'.", ref_filename);
    }

    std::vector<ReadPos> foreground_positions;
    const size_t max_dump = 10000000;
    foreground_position_table.dump(foreground_positions, max_dump);

    // Sort by sequence name so each chromosome is fetched from the FASTA
    // index exactly once below.
    std::sort(foreground_positions.begin(), foreground_positions.end(),
              ReadPosSeqnameCmp());

    Logger::push_task(task_name, foreground_positions.size());
    LoggerTask& task = Logger::get_task(task_name);

    // (gc fraction, weight) pairs
    typedef std::pair<float, float> WeightedGC;
    std::vector<WeightedGC> foreground_gc, background_gc;

    int seqlen = 0;
    SeqName curr_seqname;
    char* seq = NULL;      // current chromosome sequence (malloc'd by faidx)
    twobitseq tbseq;       // two-bit encoding of seq
    twobitseq tbseqrc;     // reverse complement of tbseq
    rng_t rng;

    // Left context length required by the sequencing bias model, if any.
    pos_t L = seqbias[0] ? seqbias[0]->getL() : 0;

    std::vector<ReadPos>::iterator i;
    for (i = foreground_positions.begin(); i != foreground_positions.end(); ++i) {
        // Load/switch chromosomes lazily as they appear in the sorted stream.
        if (i->seqname != curr_seqname) {
            free(seq);
            seq = faidx_fetch_seq(ref_file, i->seqname.get().c_str(),
                                  0, INT_MAX, &seqlen);
            Logger::debug("read sequence %s.", i->seqname.get().c_str());

            if (seq == NULL) {
                Logger::warn("warning: reference sequence not found, skipping.");
            }
            else {
                for (char* c = seq; *c; c++) *c = tolower(*c);
                tbseq = seq;
                tbseqrc = tbseq;
                tbseqrc.revcomp();
            }
            curr_seqname = i->seqname;
        }

        if (seq == NULL || (pos_t) tbseq.size() < median_frag_len) continue;

        // Fragments with many copies tend to have too much weight when
        // training, leading to somewhat less than stable results.
        if (i->count > 4) continue;

        // Sample a background position uniformly within the interval.
        // NOTE(review): if i->end - median_frag_len < i->start + L, the
        // distribution's bounds are inverted, which is undefined behavior for
        // uniform_int_distribution — confirm callers only supply intervals at
        // least median_frag_len + L long.
        boost::random::uniform_int_distribution<pos_t> random_uniform(
            i->start + L, i->end - median_frag_len);
        pos_t pos = random_uniform(rng);

        float gc = (float) gc_count(seq + pos, median_frag_len) / median_frag_len;
        // Weight by inverse predicted sequencing bias (both mates/strands).
        float sb = seqbias[0] ?
            seqbias[0]->get_bias(tbseq, pos - L) *
            seqbias[1]->get_bias(tbseqrc, seqlen - pos - 1 - L) : 1.0;
        background_gc.push_back(WeightedGC(gc, 1.0 / sb));

        // Sample the foreground (observed) position, if the window fits.
        if (i->strand == 0) {
            if (i->pos >= i->start && i->pos + median_frag_len - 1 <= i->end) {
                float sb = seqbias[0] ?
                    seqbias[0]->get_bias(tbseq, i->pos - L) *
                    seqbias[1]->get_bias(tbseqrc, seqlen - i->pos - 1 - L) : 1.0;
                foreground_gc.push_back(
                    WeightedGC((float) gc_count(seq + i->pos, median_frag_len)
                                   / median_frag_len,
                               1.0 / sb));
            }
        }
        else {
            // Negative strand: i->pos is the fragment end, so the window
            // extends median_frag_len to the left.
            if (i->pos - median_frag_len >= i->start && i->pos <= i->end) {
                float sb = seqbias[0] ?
                    seqbias[0]->get_bias(tbseq, i->pos - median_frag_len - L) *
                    seqbias[1]->get_bias(tbseqrc, seqlen - i->pos - median_frag_len - 1 - L) : 1.0;
                foreground_gc.push_back(
                    WeightedGC((float) gc_count(seq + i->pos - median_frag_len, median_frag_len)
                                   / median_frag_len,
                               1.0 / sb));
            }
        }
        task.inc();
    }
    free(seq);
    fai_destroy(ref_file);

#if 0
    // Debug dump of the sampled GC observations (disabled).
    FILE* out = fopen("gcbias.tsv", "w");
    fprintf(out, "group\tgc\tweight\n");
    BOOST_FOREACH (WeightedGC& value, foreground_gc) {
        fprintf(out, "foreground\t%f\t%f\n",
                (double) value.first, (double) value.second);
    }
void sequencing_bias::build(const char* ref_fn, PosTable& T, size_t max_reads, pos_t L, pos_t R, const char* task_name, double complexity_penalty) { Logger::push_task(task_name); clear(); const size_t min_positions = 1000; if (T.size() < min_positions) return; this->ref_fn = ref_fn; this->L = L; this->R = R; const size_t max_dump = 10000000; std::vector<ReadPos> S; S.reserve(max_dump); T.dump(S, max_dump); /* sort by tid (so we can load one chromosome at a time) */ random_shuffle(S.begin(), S.end()); sort(S.begin(), S.end(), ReadPosSeqnameCmp()); //sort(S.begin(), S.end(), ReadPosCountCmp()); //sort(S.begin(), S.begin() + max_reads, ReadPosSeqnameCmp()); /* sample foreground and background kmer frequencies */ ref_f = fai_load(ref_fn); if (ref_f == NULL) { Logger::abort("Can't open fasta file '%s'.", ref_fn); } std::deque<twobitseq*> foreground_seqs; std::deque<twobitseq*> background_seqs; /* background sampling */ int bg_samples = 1; // make this many samples for each read int bg_sample_num; // keep track of the number of samples made pos_t bg_pos; int seqlen = 0; SeqName curr_seqname; char* seq = NULL; char* local_seq; local_seq = new char[ L + R + 2 ]; local_seq[L+R+1] = '\0'; std::vector<ReadPos>::iterator i; for (i = S.begin(); i != S.end() && i != S.begin() + max_reads; ++i) { /* Load/switch sequences (chromosomes) as they are encountered in the * read stream. The idea here is to avoid thrashing by loading a large * sequence, but also avoid overloading memory by only loading one * chromosome at a time. 
*/ if (i->seqname != curr_seqname) { if (seq) free(seq); seq = faidx_fetch_seq(ref_f, i->seqname.get().c_str(), 0, INT_MAX, &seqlen); Logger::debug("read sequence %s.", i->seqname.get().c_str()); if (seq == NULL) { Logger::warn("warning: reference sequence not found, skipping."); } else { for (char* c = seq; *c; c++) *c = tolower(*c); } curr_seqname = i->seqname; } if (seq == NULL) continue; /* add a foreground sequence */ if (i->strand == strand_neg) { if (i->pos < R || i->pos >= seqlen - L) continue; memcpy(local_seq, seq + i->pos - R, (L+1+R)*sizeof(char)); seqrc(local_seq, L+1+R); } else { if (i->pos < L || i->pos >= seqlen - R) continue; memcpy(local_seq, seq + (i->pos - L), (L+1+R)*sizeof(char)); } if (strchr(local_seq, 'n') != NULL) continue; foreground_seqs.push_back(new twobitseq(local_seq)); /* add a background sequence */ /* adjust the current read position randomly, and sample */ for (bg_sample_num = 0; bg_sample_num < bg_samples;) { random_uniform_int.param( boost::random::uniform_int_distribution<pos_t>::param_type(i->start, i->end)); bg_pos = random_uniform_int(rng); if (i->strand == strand_neg) { if (bg_pos < R || bg_pos >= seqlen - L) continue; memcpy(local_seq, seq + bg_pos - R, (L+1+R)*sizeof(char)); seqrc(local_seq, L+1+R); } else { if (bg_pos < L || bg_pos >= seqlen - R) continue; memcpy(local_seq, seq + (bg_pos-L), (L+1+R)*sizeof(char)); } if (strchr(local_seq, 'n') != NULL) continue; background_seqs.push_back(new twobitseq(local_seq)); bg_sample_num++; } } size_t max_parents = 4; size_t max_distance = 10; /* A bit of a hack: if we are training on very few reads (a couple thousand, * as a opposed to tens of thousands), we tend to end up with too sparse of * a model. 
*/ if (foreground_seqs.size() < 10000) complexity_penalty = 0.25; M = new motif(background_seqs, foreground_seqs, L + 1 + R, max_parents, max_distance, complexity_penalty, task_name); std::deque<twobitseq*>::iterator seqit; for (seqit = background_seqs.begin(); seqit != background_seqs.end(); seqit++) { delete *seqit; } for (seqit = foreground_seqs.begin(); seqit != foreground_seqs.end(); seqit++) { delete *seqit; } free(seq); delete [] local_seq; Logger::pop_task(task_name); }