コード例 #1
0
/**
 * Extract reference sequence region for motif discovery in a fuzzy fashion.
 */
void CandidateRegionExtractor::extract_regions_by_fuzzy_alignment(bcf_hdr_t* h, bcf1_t* v,  Variant& variant)
{
    if (debug)
    {
        if (debug) std::cerr << "********************************************\n";
        std::cerr << "EXTRACTIING REGION BY FUZZY ALIGNMENT\n\n";
    }

    VNTR& vntr = variant.vntr;
    const char* chrom = bcf_get_chrom(h, v);

    int32_t min_beg1 = bcf_get_pos1(v);
    int32_t max_end1 = min_beg1;

    //merge candidate search region
    for (size_t i=1; i<bcf_get_n_allele(v); ++i)
    {
        std::string ref(bcf_get_alt(v, 0));
        std::string alt(bcf_get_alt(v, i));
        int32_t pos1 = bcf_get_pos1(v);

        trim(pos1, ref, alt);

        if (debug)
        {
            std::cerr << "indel fragment : " << (ref.size()<alt.size()? alt : ref) << "\n";
            std::cerr << "               : " << ref << ":" << alt << "\n";
        }

        min_beg1 = fuzzy_left_align(chrom, pos1, ref, alt, 3);
        max_end1 = fuzzy_right_align(chrom, pos1 + ref.size() - 1, ref, alt, 3);

        int32_t seq_len;
        char* seq = faidx_fetch_seq(fai, chrom, min_beg1-1, max_end1-1, &seq_len);
        if (debug)
        {
            std::cerr << "FUZZY REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") " << "\n";
            std::cerr << "             " << seq << "\n";
        }

        if (seq_len) free(seq);
    }

    int32_t seq_len;
    char* seq = faidx_fetch_seq(fai, chrom, min_beg1-1, max_end1-1, &seq_len);

    if (debug)
    {
        std::cerr << "FINAL FUZZY REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") " << "\n";
        std::cerr << "                   " << seq << "\n";
    }

    vntr.exact_repeat_tract = seq;
    vntr.exact_rbeg1 = min_beg1;

    if (seq_len) free(seq);
}
コード例 #2
0
ファイル: bam_plcmd.c プロジェクト: boratonAJ/gotcloud
static int mplp_get_ref(mplp_aux_t *ma, int tid,  char **ref, int *ref_len) {
    mplp_ref_t *r = ma->ref;

    //printf("get ref %d {%d/%p, %d/%p}\n", tid, r->ref_id[0], r->ref[0], r->ref_id[1], r->ref[1]);

    if (!r || !ma->conf->fai) {
        *ref = NULL;
        return 0;
    }

    // Do we need to reference count this so multiple mplp_aux_t can
    // track which references are in use?
    // For now we just cache the last two. Sufficient?
    if (tid == r->ref_id[0]) {
        *ref = r->ref[0];
        *ref_len = r->ref_len[0];
        return 1;
    }
    if (tid == r->ref_id[1]) {
        // Last, swap over
        int tmp;
        tmp = r->ref_id[0];  r->ref_id[0]  = r->ref_id[1];  r->ref_id[1]  = tmp;
        tmp = r->ref_len[0]; r->ref_len[0] = r->ref_len[1]; r->ref_len[1] = tmp;

        char *tc;
        tc = r->ref[0]; r->ref[0] = r->ref[1]; r->ref[1] = tc;
        *ref = r->ref[0];
        *ref_len = r->ref_len[0];
        return 1;
    }

    // New, so migrate to old and load new
    if (r->ref[1])
        free(r->ref[1]);
    r->ref[1]     = r->ref[0];
    r->ref_id[1]  = r->ref_id[0];
    r->ref_len[1] = r->ref_len[0];

    r->ref_id[0] = tid;
    r->ref[0] = faidx_fetch_seq(ma->conf->fai,
                                ma->h->target_name[r->ref_id[0]],
                                0,
                                0x7fffffff,
                                &r->ref_len[0]);

    if (!r->ref) {
        r->ref[0] = NULL;
        r->ref_id[0] = -1;
        r->ref_len[0] = 0;
        *ref = NULL;
        return 0;
    }

    *ref = r->ref[0];
    *ref_len = r->ref_len[0];
    return 1;
}
コード例 #3
0
ファイル: samutils.c プロジェクト: CSB5/lofreq
/* check match between reference and bam files. prints an error
 * message and return non-zero on mismatch 
*/
int checkref(char *fasta_file, char *bam_file)
{
     int i = -1;
     bam_header_t *header;
     faidx_t *fai;
     char *ref;
     int ref_len = -1;
     bamFile bam_fp;
     
     if (! file_exists(fasta_file)) {
          LOG_FATAL("Fsata file %s does not exist. Exiting...\n", fasta_file);
          return 1;
     }     

     if (0 != strcmp(bam_file, "-")  && ! file_exists(bam_file)) {
          LOG_FATAL("BAM file %s does not exist. Exiting...\n", bam_file);
          return 1;
     }     

     bam_fp = strcmp(bam_file, "-") == 0 ? bam_dopen(fileno(stdin), "r") : bam_open(bam_file, "r");
     header = bam_header_read(bam_fp);
     if (!header) {
          LOG_FATAL("Failed to read BAM header from %s\n", bam_file);
          return 1;
     }
     
     fai = fai_load(fasta_file);
     if (!fai) {
          LOG_FATAL("Failed to fasta index for %s\n", fasta_file);
          return 1;
     }
     
     for (i=0; i < header->n_targets; i++) {
          LOG_DEBUG("BAM header target %d of %d: name=%s len=%d\n", 
                    i+1, header->n_targets, header->target_name[i], header->target_len[i]);
          
          ref = faidx_fetch_seq(fai, header->target_name[i], 
                                0, 0x7fffffff, &ref_len);
          if (NULL == ref) {
               LOG_FATAL("Failed to fetch sequence %s from fasta file\n", header->target_name[i]);
               return -1;
          }
          if (header->target_len[i] != ref_len) {
               LOG_FATAL("Sequence length mismatch for sequence %s (%dbp in fasta; %dbp in bam)\n", 
                         header->target_name[i], header->target_len[i], ref_len);
               return -1;
          }
          free(ref);
     }
     
     fai_destroy(fai);
     bam_header_destroy(header);
     bam_close(bam_fp);

     return 0;
}
コード例 #4
0
std::string IndexedFastaReader::Subsequence(const std::string& id,
                                            Position begin,
                                            Position end) const
{
    REQUIRE_FAIDX_LOADED;

    int len;
    // Derek: *Annoyingly* htslib seems to interpret "end" as inclusive in
    // faidx_fetch_seq, whereas it considers it exclusive in the region spec in
    // fai_fetch.  Can you please verify?
    char* rawSeq = faidx_fetch_seq(handle_, id.c_str(), begin, end - 1, &len);
    if (rawSeq == nullptr)
        throw std::runtime_error("could not fetch FASTA sequence");
    else {
        std::string seq(rawSeq);
        free(rawSeq);
        return seq;
    }
}
コード例 #5
0
/**
 * Right align alleles.
 */
void CandidateRegionExtractor::right_align(const char* chrom, int32_t& pos1, std::string& ref, std::string& alt)
{
    int32_t seq_len;
    char* seq;
    while (ref.at(0)==alt.at(0))
    {
        seq = faidx_fetch_seq(fai, chrom, pos1, pos1, &seq_len);
        if (seq_len)
        {
            ref.erase(0,1);
            alt.erase(0,1);
            ref.push_back(seq[0]);
            alt.push_back(seq[0]);
            free(seq);
            ++pos1;
        }
        else
        {
            fprintf(stderr, "[%s:%d %s] Cannot read from sequence file\n", __FILE__,__LINE__,__FUNCTION__);
            exit(1);
        }
    }
}
コード例 #6
0
/**
 * Left align alleles.
 */
void CandidateRegionExtractor::left_align(const char* chrom, int32_t& pos1, std::string& ref, std::string& alt)
{
    int32_t seq_len;
    char* seq;
    while (ref.at(ref.size()-1)==alt.at(alt.size()-1) && pos1>1)
    {
        seq = faidx_fetch_seq(fai, chrom, pos1-2, pos1-2, &seq_len);
        if (seq_len)
        {
            ref.erase(ref.size()-1,1);
            alt.erase(alt.size()-1,1);
            ref.insert(0, 1, seq[0]);
            alt.insert(0, 1, seq[0]);
            free(seq);
            --pos1;
        }
        else
        {
            fprintf(stderr, "[%s:%d %s] Cannot read from sequence file\n", __FILE__,__LINE__,__FUNCTION__);
            exit(1);
        }
    }
}
コード例 #7
0
ファイル: bam_plcmd.c プロジェクト: 9beckert/TIR
static int mpileup(mplp_conf_t *conf, int n, char **fn)
{
	extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list);
	extern void bcf_call_del_rghash(void *rghash);
	mplp_aux_t **data;
	int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid = -1, max_depth, max_indel_depth;
	const bam_pileup1_t **plp;
	bam_mplp_t iter;
	bam_header_t *h = 0;
	char *ref;
	void *rghash = 0;

	bcf_callaux_t *bca = 0;
	bcf_callret1_t *bcr = 0;
	bcf_call_t bc;
	bcf_t *bp = 0;
	bcf_hdr_t *bh = 0;

	bam_sample_t *sm = 0;
	kstring_t buf;
	mplp_pileup_t gplp;

	memset(&gplp, 0, sizeof(mplp_pileup_t));
	memset(&buf, 0, sizeof(kstring_t));
	memset(&bc, 0, sizeof(bcf_call_t));
	data = calloc(n, sizeof(void*));
	plp = calloc(n, sizeof(void*));
	n_plp = calloc(n, sizeof(int*));
	sm = bam_smpl_init();

	// read the header and initialize data
	for (i = 0; i < n; ++i) {
		bam_header_t *h_tmp;
		data[i] = calloc(1, sizeof(mplp_aux_t));
		data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r");
		data[i]->conf = conf;
		h_tmp = bam_header_read(data[i]->fp);
		data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet
		bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text);
		rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list);
		if (conf->reg) {
			int beg, end;
			bam_index_t *idx;
			idx = bam_index_load(fn[i]);
			if (idx == 0) {
				fprintf(stderr, "[%s] fail to load index for %d-th input.\n", __func__, i+1);
				exit(1);
			}
			if (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) {
				fprintf(stderr, "[%s] malformatted region or wrong seqname for %d-th input.\n", __func__, i+1);
				exit(1);
			}
			if (i == 0) tid0 = tid, beg0 = beg, end0 = end;
			data[i]->iter = bam_iter_query(idx, tid, beg, end);
			bam_index_destroy(idx);
		}
		if (i == 0) h = h_tmp;
		else {
			// FIXME: to check consistency
			bam_header_destroy(h_tmp);
		}
	}
	gplp.n = sm->n;
	gplp.n_plp = calloc(sm->n, sizeof(int));
	gplp.m_plp = calloc(sm->n, sizeof(int));
	gplp.plp = calloc(sm->n, sizeof(void*));

	fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n);
	// write the VCF header
	if (conf->flag & MPLP_GLF) {
		kstring_t s;
		bh = calloc(1, sizeof(bcf_hdr_t));
		s.l = s.m = 0; s.s = 0;
		bp = bcf_open("-", (conf->flag&MPLP_NO_COMP)? "wu" : "w");
		for (i = 0; i < h->n_targets; ++i) {
			kputs(h->target_name[i], &s);
			kputc('\0', &s);
		}
		bh->l_nm = s.l;
		bh->name = malloc(s.l);
		memcpy(bh->name, s.s, s.l);
		s.l = 0;
		for (i = 0; i < sm->n; ++i) {
			kputs(sm->smpl[i], &s); kputc('\0', &s);
		}
		bh->l_smpl = s.l;
		bh->sname = malloc(s.l);
		memcpy(bh->sname, s.s, s.l);
		bh->txt = malloc(strlen(BAM_VERSION) + 64);
		bh->l_txt = 1 + sprintf(bh->txt, "##samtoolsVersion=%s\n", BAM_VERSION);
		free(s.s);
		bcf_hdr_sync(bh);
		bcf_hdr_write(bp, bh);
		bca = bcf_call_init(-1., conf->min_baseQ);
		bcr = calloc(sm->n, sizeof(bcf_callret1_t));
		bca->rghash = rghash;
		bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ;
		bca->min_frac = conf->min_frac;
		bca->min_support = conf->min_support;
	}
	if (tid0 >= 0 && conf->fai) { // region is set
		ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len);
		ref_tid = tid0;
		for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0;
	} else ref_tid = -1, ref = 0;
	iter = bam_mplp_init(n, mplp_func, (void**)data);
	max_depth = conf->max_depth;
	if (max_depth * sm->n > 1<<20)
		fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__);
	if (max_depth * sm->n < 8000) {
		max_depth = 8000 / sm->n;
		fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth);
	}
	max_indel_depth = conf->max_indel_depth * sm->n;
	bam_mplp_set_maxcnt(iter, max_depth);
	while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) {
		if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
		if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
		if (tid != ref_tid) {
			free(ref); ref = 0;
			if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len);
			for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid;
			ref_tid = tid;
		}
		if (conf->flag & MPLP_GLF) {
			int total_depth, _ref0, ref16;
			bcf1_t *b = calloc(1, sizeof(bcf1_t));
			for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i];
			group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG);
			_ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
			ref16 = bam_nt16_table[_ref0];
			for (i = 0; i < gplp.n; ++i)
				bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i);
			bcf_call_combine(gplp.n, bcr, ref16, &bc);
			bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
						 (conf->flag&MPLP_FMT_SP), 0, 0);
			bcf_write(bp, bh, b);
			bcf_destroy(b);
			// call indels
			if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) {
				for (i = 0; i < gplp.n; ++i)
					bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i);
				if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) {
					b = calloc(1, sizeof(bcf1_t));
					bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
								 (conf->flag&MPLP_FMT_SP), bca, ref);
					bcf_write(bp, bh, b);
					bcf_destroy(b);
				}
			}
		} else {
			printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N');
			for (i = 0; i < n; ++i) {
				int j;
				printf("\t%d\t", n_plp[i]);
				if (n_plp[i] == 0) {
					printf("*\t*"); // FIXME: printf() is very slow...
					if (conf->flag & MPLP_PRINT_POS) printf("\t*");
				} else {
					for (j = 0; j < n_plp[i]; ++j)
						pileup_seq(plp[i] + j, pos, ref_len, ref);
					putchar('\t');
					for (j = 0; j < n_plp[i]; ++j) {
						const bam_pileup1_t *p = plp[i] + j;
						int c = bam1_qual(p->b)[p->qpos] + 33;
						if (c > 126) c = 126;
						putchar(c);
					}
					if (conf->flag & MPLP_PRINT_MAPQ) {
						putchar('\t');
						for (j = 0; j < n_plp[i]; ++j) {
							int c = plp[i][j].b->core.qual + 33;
							if (c > 126) c = 126;
							putchar(c);
						}
					}
					if (conf->flag & MPLP_PRINT_POS) {
						putchar('\t');
						for (j = 0; j < n_plp[i]; ++j) {
							if (j > 0) putchar(',');
							printf("%d", plp[i][j].qpos + 1); // FIXME: printf() is very slow...
						}
					}
				}
			}
			putchar('\n');
		}
	}

	bcf_close(bp);
	bam_smpl_destroy(sm); free(buf.s);
	for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]);
	free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp);
	bcf_call_del_rghash(rghash);
	bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr);
	bam_mplp_destroy(iter);
	bam_header_destroy(h);
	for (i = 0; i < n; ++i) {
		bam_close(data[i]->fp);
		if (data[i]->iter) bam_iter_destroy(data[i]->iter);
		free(data[i]);
	}
	free(data); free(plp); free(ref); free(n_plp);
	return 0;
}
コード例 #8
0
ファイル: gcbias.cpp プロジェクト: dcjones/isolator
GCBias::GCBias(const char* ref_filename, PosTable& foreground_position_table,
	           pos_t median_frag_len,
               sequencing_bias* seqbias[2],
               const char* task_name)
{
	faidx_t* ref_file = fai_load(ref_filename);
	if (!ref_file) {
        Logger::abort("Can't open fasta file '%s'.", ref_filename);
	}

	std::vector<ReadPos> foreground_positions;
    const size_t max_dump = 10000000;
	foreground_position_table.dump(foreground_positions, max_dump);
	std::sort(foreground_positions.begin(), foreground_positions.end(), ReadPosSeqnameCmp());

	Logger::push_task(task_name, foreground_positions.size());
	LoggerTask& task = Logger::get_task(task_name);

    typedef std::pair<float, float> WeightedGC;

	std::vector<WeightedGC> foreground_gc, background_gc;

    int  seqlen = 0;
    SeqName curr_seqname;
    char* seq = NULL;
    twobitseq tbseq;
    twobitseq tbseqrc;
    rng_t rng;

    pos_t L = seqbias[0] ? seqbias[0]->getL() : 0;

    std::vector<ReadPos>::iterator i;
    for (i = foreground_positions.begin(); i != foreground_positions.end(); ++i) {
        if (i->seqname != curr_seqname) {
            free(seq);
            seq = faidx_fetch_seq(ref_file, i->seqname.get().c_str(), 0, INT_MAX, &seqlen);
            Logger::debug("read sequence %s.", i->seqname.get().c_str());

            if (seq == NULL) {
                Logger::warn("warning: reference sequence not found, skipping.");
            }
            else {
                for (char* c = seq; *c; c++) *c = tolower(*c);
                tbseq = seq;
                tbseqrc = tbseq;
                tbseqrc.revcomp();

            }

            curr_seqname = i->seqname;
        }

        if (seq == NULL || (pos_t) tbseq.size() < median_frag_len) continue;

        // fragments with many copies tend to have too much weight when training
        // leading to somewhat less than stable results.
        if (i->count > 4) continue;

        // sample background position
        boost::random::uniform_int_distribution<pos_t> random_uniform(
                i->start + L, i->end - median_frag_len);
        pos_t pos = random_uniform(rng);
        float gc = (float) gc_count(seq + pos, median_frag_len) / median_frag_len;
        float sb = seqbias[0] ?
                   seqbias[0]->get_bias(tbseq, pos - L) *
                   seqbias[1]->get_bias(tbseqrc, seqlen - pos - 1 - L) : 1.0;
        background_gc.push_back(WeightedGC(gc, 1.0 / sb));

        // sample foreground position
        if (i->strand == 0) {
            if (i->pos >= i->start && i->pos + median_frag_len - 1 <= i->end) {
                float sb = seqbias[0] ?
                           seqbias[0]->get_bias(tbseq, i->pos - L) *
                           seqbias[1]->get_bias(tbseqrc, seqlen - i->pos - 1 - L) : 1.0;

                foreground_gc.push_back(
                    WeightedGC((float) gc_count(seq + i->pos, median_frag_len) / median_frag_len,
                               1.0 / sb));
            }
        } else {
            if (i->pos - median_frag_len >= i->start && i->pos <= i->end) {
                float sb = seqbias[0] ?
                           seqbias[0]->get_bias(tbseq, i->pos - median_frag_len - L) *
                           seqbias[1]->get_bias(tbseqrc, seqlen - i->pos - median_frag_len - 1 - L) : 1.0;
                foreground_gc.push_back(
                    WeightedGC((float) gc_count(seq + i->pos - median_frag_len, median_frag_len) / median_frag_len,
                               1.0 /sb));
            }
        }
        task.inc();
    }

    free(seq);
    fai_destroy(ref_file);

#if 0
    FILE* out = fopen("gcbias.tsv", "w");
    fprintf(out, "group\tgc\tweight\n");
    BOOST_FOREACH (WeightedGC& value, foreground_gc) {
        fprintf(out, "foreground\t%f\t%f\n", (double) value.first, (double) value.second);
    }
コード例 #9
0
/**
 * Fuzzy right align alleles allowing for mismatches and indels defined by penalty.
 *
 * @chrom   - chromosome
 * @pos1    - 1 based position
 * @ref     - reference sequence
 * @alt     - alternative sequence
 * @penalty - mismatch/indels allowed
 *
 * Returns right aligned position.
 */
uint32_t CandidateRegionExtractor::fuzzy_right_align(const char* chrom, int32_t pos1, std::string ref, std::string alt, uint32_t penalty)
{
    if (ref==alt)
    {
        return pos1;
    }

    int32_t seq_len;
    char* seq;
    while (ref.at(0)==alt.at(0))
    {
        seq = faidx_fetch_seq(fai, chrom, pos1, pos1, &seq_len);
        if (seq_len)
        {
            ref.erase(0,1);
            alt.erase(0,1);
            ref.push_back(seq[0]);
            alt.push_back(seq[0]);
            free(seq);
            ++pos1;
        }
        else
        {
            fprintf(stderr, "[%s:%d %s] Cannot read from sequence file\n", __FILE__,__LINE__,__FUNCTION__);
            exit(1);
        }
    }

    if (penalty)
    {
        uint32_t pos1_sub = pos1;
        uint32_t pos1_del = pos1;
        uint32_t pos1_ins = pos1;

        //substitution
        seq = faidx_fetch_seq(fai, chrom, pos1, pos1, &seq_len);
        if (seq_len)
        {
            std::string new_ref = ref;
            std::string new_alt = alt;
            new_ref.erase(0,1);
            new_alt.erase(0,1);
            new_ref.push_back(seq[0]);
            new_alt.push_back(seq[0]);
            //std::cerr << "\tsub: " << chrom << ":" << pos1+1 << ":" << new_ref << ":" << new_alt << " (" <<  penalty-1 << ")\n";
            pos1_sub = fuzzy_right_align(chrom, pos1+1, new_ref, new_alt, penalty-1);
        }
        else
        {
            fprintf(stderr, "[%s:%d %s] Cannot read from sequence file\n", __FILE__,__LINE__,__FUNCTION__);
            exit(1);
        }

        //deletion
        if (ref.size()>1)
        {
            std::string new_ref = ref;
            new_ref.erase(0,1);
            //std::cerr << "\tdel: " << chrom << ":" << pos1 << ":" << new_ref << ":" << alt << " (" <<  penalty-1 << ")\n";
            pos1_del = fuzzy_right_align(chrom, pos1, new_ref, alt, penalty-1);
        }

        //insertion
        if (alt.size()>1)
        {
            std::string new_alt = alt;
            new_alt.erase(0,1);
            //std::cerr << "\tins: " << chrom << ":" << pos1 << ":" << ref << ":" << new_alt << " (" <<  penalty-1 << ")\n";
            pos1_ins = fuzzy_right_align(chrom, pos1, ref, new_alt, penalty-1);
        }

        pos1 = std::max(pos1_sub, std::max(pos1_del, pos1_ins));
    }

    return pos1;

}
コード例 #10
0
/**
 * Extract reference sequence region for motif discovery.
 *
 * The input is a VCF record that contains an indel.
 * 
 * If the the indel has multiple alleles, it will examine all
 * alleles.
 *
 * todo: is might be a good idea to combine this step with motif detection
 *       since there seems to be a need to have an iterative process here
 *       to ensure a good candidate motif is chosen. *  
 */
void CandidateRegionExtractor::extract_regions_by_exact_alignment(bcf_hdr_t* h, bcf1_t* v, Variant& variant)
{
    if (debug)
    {
        if (debug) std::cerr << "********************************************\n";
        std::cerr << "EXTRACTIING REGION BY EXACT LEFT AND RIGHT ALIGNMENT\n\n";
    }

    VNTR& vntr = variant.vntr;
    const char* chrom = bcf_get_chrom(h, v);

    int32_t min_beg1 = bcf_get_pos1(v);
    int32_t max_end1 = min_beg1;

    if (debug)
    {
       bcf_print_liten(h, v);
    }

    //merge candidate search region
    for (size_t i=1; i<bcf_get_n_allele(v); ++i)
    {
        std::string ref(bcf_get_alt(v, 0));
        std::string alt(bcf_get_alt(v, i));
        int32_t pos1 = bcf_get_pos1(v);

        //this prevents introduction of flanks that do not harbour the repeat unit
        trim(pos1, ref, alt);

        int32_t end1 = pos1 + ref.size() - 1;
        right_align(chrom, end1, ref, alt);

        int32_t beg1 = end1 - ref.size() + 1;
        left_align(chrom, beg1, ref, alt);

        min_beg1 = beg1<min_beg1 ? beg1 : min_beg1;
        max_end1 = end1>max_end1 ? end1 : max_end1;

        int32_t seq_len;
        char* seq = faidx_fetch_seq(fai, chrom, min_beg1-1, max_end1-1, &seq_len);

        if (debug)
        {
            std::cerr << "EXACT REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") from " << pos1 << ":" << ref << ":" << alt << "\n";
            std::cerr << "             " << seq << "\n";
        }

        if (seq_len) free(seq);
    }

    int32_t seq_len;
    char* seq = faidx_fetch_seq(fai, chrom, min_beg1-1, max_end1-1, &seq_len);

    if (debug)
    {
        std::cerr << "FINAL EXACT REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") " << "\n";
        std::cerr << "                   " << seq << "\n";
    }

    vntr.exact_repeat_tract = seq;
    vntr.rid = bcf_get_rid(v);
    vntr.exact_rbeg1 = min_beg1;
    vntr.exact_rend1 = max_end1;
    
    if (seq_len) free(seq);
}
コード例 #11
0
ファイル: bam_plcmd.c プロジェクト: Annak17/partis
/*
 * Performs pileup
 * @param conf configuration for this pileup
 * @param n number of files specified in fn
 * @param fn filenames
 */
static int mpileup(mplp_conf_t *conf, int n, char **fn)
{
    extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list);
    extern void bcf_call_del_rghash(void *rghash);
    mplp_aux_t **data;
    int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid = -1, max_depth, max_indel_depth;
    const bam_pileup1_t **plp;
    bam_mplp_t iter;
    bam_hdr_t *h = NULL; /* header of first file in input list */
    char *ref;
    void *rghash = NULL;
    FILE *pileup_fp = NULL;

    bcf_callaux_t *bca = NULL;
    bcf_callret1_t *bcr = NULL;
    bcf_call_t bc;
    htsFile *bcf_fp = NULL;
    bcf_hdr_t *bcf_hdr = NULL;

    bam_sample_t *sm = NULL;
    kstring_t buf;
    mplp_pileup_t gplp;

    memset(&gplp, 0, sizeof(mplp_pileup_t));
    memset(&buf, 0, sizeof(kstring_t));
    memset(&bc, 0, sizeof(bcf_call_t));
    data = calloc(n, sizeof(mplp_aux_t*));
    plp = calloc(n, sizeof(bam_pileup1_t*));
    n_plp = calloc(n, sizeof(int));
    sm = bam_smpl_init();

    if (n == 0) {
        fprintf(stderr,"[%s] no input file/data given\n", __func__);
        exit(1);
    }

    // read the header of each file in the list and initialize data
    for (i = 0; i < n; ++i) {
        bam_hdr_t *h_tmp;
        data[i] = calloc(1, sizeof(mplp_aux_t));
        data[i]->fp = sam_open(fn[i], "rb");
        if ( !data[i]->fp )
        {
            fprintf(stderr, "[%s] failed to open %s: %s\n", __func__, fn[i], strerror(errno));
            exit(1);
        }
        hts_set_fai_filename(data[i]->fp, conf->fai_fname);
        data[i]->conf = conf;
        h_tmp = sam_hdr_read(data[i]->fp);
        if ( !h_tmp ) {
            fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, fn[i]);
            exit(1);
        }
        data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet
        bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text);
        // Collect read group IDs with PL (platform) listed in pl_list (note: fragile, strstr search)
        rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list);
        if (conf->reg) {
            hts_idx_t *idx = sam_index_load(data[i]->fp, fn[i]);
            if (idx == 0) {
                fprintf(stderr, "[%s] fail to load index for %s\n", __func__, fn[i]);
                exit(1);
            }
            if ( (data[i]->iter=sam_itr_querys(idx, data[i]->h, conf->reg)) == 0) {
                fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, conf->reg);
                exit(1);
            }
            if (i == 0) tid0 = data[i]->iter->tid, beg0 = data[i]->iter->beg, end0 = data[i]->iter->end;
            hts_idx_destroy(idx);
        }
        if (i == 0) h = h_tmp; /* save the header of first file in list */
        else {
            // FIXME: to check consistency
            bam_hdr_destroy(h_tmp);
        }
    }
    // allocate data storage proportionate to number of samples being studied sm->n
    gplp.n = sm->n;
    gplp.n_plp = calloc(sm->n, sizeof(int));
    gplp.m_plp = calloc(sm->n, sizeof(int));
    gplp.plp = calloc(sm->n, sizeof(bam_pileup1_t*));

    fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n);
    // write the VCF header
    if (conf->flag & MPLP_BCF)
    {
        const char *mode;
        if ( conf->flag & MPLP_VCF )
            mode = (conf->flag&MPLP_NO_COMP)? "wu" : "wz";   // uncompressed VCF or compressed VCF
        else
            mode = (conf->flag&MPLP_NO_COMP)? "wub" : "wb";  // uncompressed BCF or compressed BCF

        bcf_fp = bcf_open(conf->output_fname? conf->output_fname : "-", mode);
        if (bcf_fp == NULL) {
            fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno));
            exit(1);
        }

        bcf_hdr = bcf_hdr_init("w");
        kstring_t str = {0,0,0};

        ksprintf(&str, "##samtoolsVersion=%s+htslib-%s\n",samtools_version(),hts_version());
        bcf_hdr_append(bcf_hdr, str.s);

        str.l = 0;
        ksprintf(&str, "##samtoolsCommand=samtools mpileup");
        for (i=1; i<conf->argc; i++) ksprintf(&str, " %s", conf->argv[i]);
        kputc('\n', &str);
        bcf_hdr_append(bcf_hdr, str.s);

        if (conf->fai_fname)
        {
            str.l = 0;
            ksprintf(&str, "##reference=file://%s\n", conf->fai_fname);
            bcf_hdr_append(bcf_hdr, str.s);
        }

        // todo: use/write new BAM header manipulation routines, fill also UR, M5
        for (i=0; i<h->n_targets; i++)
        {
            str.l = 0;
            ksprintf(&str, "##contig=<ID=%s,length=%d>", h->target_name[i], h->target_len[i]);
            bcf_hdr_append(bcf_hdr, str.s);
        }
        free(str.s);
        bcf_hdr_append(bcf_hdr,"##ALT=<ID=X,Description=\"Represents allele(s) other than observed.\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=IDV,Number=1,Type=Integer,Description=\"Maximum number of reads supporting an indel\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=IMF,Number=1,Type=Float,Description=\"Maximum fraction of reads supporting an indel\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)\",Version=\"3\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">");
#if CDF_MWU_TESTS
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=RPB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias [CDF] (bigger is better)\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias [CDF] (bigger is better)\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=BQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias [CDF] (bigger is better)\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQSB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias [CDF] (bigger is better)\">");
#endif
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=SGB,Number=1,Type=Float,Description=\"Segregation based metric.\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQ0F,Number=1,Type=Float,Description=\"Fraction of MQ0 reads (smaller is better)\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=I16,Number=16,Type=Float,Description=\"Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=QS,Number=R,Type=Float,Description=\"Auxiliary tag used for calling\">");
        bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">");
        if ( conf->fmt_flag&B2B_FMT_DP )
            bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Number of high-quality bases\">");
        if ( conf->fmt_flag&B2B_FMT_DV )
            bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"Number of high-quality non-reference bases\">");
        if ( conf->fmt_flag&B2B_FMT_DPR )
            bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">");
        if ( conf->fmt_flag&B2B_INFO_DPR )
            bcf_hdr_append(bcf_hdr,"##INFO=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">");
        if ( conf->fmt_flag&B2B_FMT_DP4 )
            bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-fwd, ref-reverse, alt-fwd and alt-reverse bases\">");
        if ( conf->fmt_flag&B2B_FMT_SP )
            bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">");

        for (i=0; i<sm->n; i++)
            bcf_hdr_add_sample(bcf_hdr, sm->smpl[i]);
        bcf_hdr_add_sample(bcf_hdr, NULL);
        bcf_hdr_write(bcf_fp, bcf_hdr);

        bca = bcf_call_init(-1., conf->min_baseQ);
        bcr = calloc(sm->n, sizeof(bcf_callret1_t));
        bca->rghash = rghash;
        bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ;
        bca->min_frac = conf->min_frac;
        bca->min_support = conf->min_support;
        bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE;

        bc.bcf_hdr = bcf_hdr;
        bc.n = sm->n;
        bc.PL = malloc(15 * sm->n * sizeof(*bc.PL));
        if (conf->fmt_flag)
        {
            assert( sizeof(float)==sizeof(int32_t) );
            bc.DP4 = malloc(sm->n * sizeof(int32_t) * 4);
            bc.fmt_arr = malloc(sm->n * sizeof(float)); // all fmt_flag fields
            if ( conf->fmt_flag&(B2B_INFO_DPR|B2B_FMT_DPR) )
            {
                // first B2B_MAX_ALLELES fields for total numbers, the rest per-sample
                bc.DPR = malloc((sm->n+1)*B2B_MAX_ALLELES*sizeof(int32_t));
                for (i=0; i<sm->n; i++)
                    bcr[i].DPR = bc.DPR + (i+1)*B2B_MAX_ALLELES;
            }
        }
    }
    else {
        pileup_fp = conf->output_fname? fopen(conf->output_fname, "w") : stdout;

        if (pileup_fp == NULL) {
            fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname, strerror(errno));
            exit(1);
        }
    }

    if (tid0 >= 0 && conf->fai) { // region is set
        ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len);
        ref_tid = tid0;
        for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0;
    } else ref_tid = -1, ref = 0;

    // begin pileup
    iter = bam_mplp_init(n, mplp_func, (void**)data);
    if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(iter);
    max_depth = conf->max_depth;
    if (max_depth * sm->n > 1<<20)
        fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__);
    if (max_depth * sm->n < 8000) {
        max_depth = 8000 / sm->n;
        fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth);
    }
    max_indel_depth = conf->max_indel_depth * sm->n;
    bam_mplp_set_maxcnt(iter, max_depth);
    bcf1_t *bcf_rec = bcf_init1();
    int ret;
    while ( (ret=bam_mplp_auto(iter, &tid, &pos, n_plp, plp)) > 0) {
        if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
        if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
        if (tid != ref_tid) {
            free(ref); ref = 0;
            if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len);
            for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid;
            ref_tid = tid;
        }
        if (conf->flag & MPLP_BCF) {
            int total_depth, _ref0, ref16;
            for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i];
            group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG);
            _ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
            ref16 = seq_nt16_table[_ref0];
            bcf_callaux_clean(bca, &bc);
            for (i = 0; i < gplp.n; ++i)
                bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i);
            bc.tid = tid; bc.pos = pos;
            bcf_call_combine(gplp.n, bcr, bca, ref16, &bc);
            bcf_clear1(bcf_rec);
            bcf_call2bcf(&bc, bcf_rec, bcr, conf->fmt_flag, 0, 0);
            bcf_write1(bcf_fp, bcf_hdr, bcf_rec);
            // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring?
            if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0)
            {
                bcf_callaux_clean(bca, &bc);
                for (i = 0; i < gplp.n; ++i)
                    bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i);
                if (bcf_call_combine(gplp.n, bcr, bca, -1, &bc) >= 0) {
                    bcf_clear1(bcf_rec);
                    bcf_call2bcf(&bc, bcf_rec, bcr, conf->fmt_flag, bca, ref);
                    bcf_write1(bcf_fp, bcf_hdr, bcf_rec);
                }
            }
        } else {
            fprintf(pileup_fp, "%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N');
            for (i = 0; i < n; ++i) {
                int j, cnt;
                for (j = cnt = 0; j < n_plp[i]; ++j) {
                    const bam_pileup1_t *p = plp[i] + j;
                    if (bam_get_qual(p->b)[p->qpos] >= conf->min_baseQ) ++cnt;
                }
                fprintf(pileup_fp, "\t%d\t", cnt);
                if (n_plp[i] == 0) {
                    fputs("*\t*", pileup_fp);
                    if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", pileup_fp);
                    if (conf->flag & MPLP_PRINT_POS) fputs("\t*", pileup_fp);
                } else {
                    for (j = 0; j < n_plp[i]; ++j) {
                        const bam_pileup1_t *p = plp[i] + j;
                        if (bam_get_qual(p->b)[p->qpos] >= conf->min_baseQ)
                            pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref);
                    }
                    putc('\t', pileup_fp);
                    for (j = 0; j < n_plp[i]; ++j) {
                        const bam_pileup1_t *p = plp[i] + j;
                        int c = bam_get_qual(p->b)[p->qpos];
                        if (c >= conf->min_baseQ) {
                            c = c + 33 < 126? c + 33 : 126;
                            putc(c, pileup_fp);
                        }
                    }
                    if (conf->flag & MPLP_PRINT_MAPQ) {
                        putc('\t', pileup_fp);
                        for (j = 0; j < n_plp[i]; ++j) {
                            const bam_pileup1_t *p = plp[i] + j;
                            int c = bam_get_qual(p->b)[p->qpos];
                            if ( c < conf->min_baseQ ) continue;
                            c = plp[i][j].b->core.qual + 33;
                            if (c > 126) c = 126;
                            putc(c, pileup_fp);
                        }
                    }
                    if (conf->flag & MPLP_PRINT_POS) {
                        putc('\t', pileup_fp);
                        for (j = 0; j < n_plp[i]; ++j) {
                            if (j > 0) putc(',', pileup_fp);
                            fprintf(pileup_fp, "%d", plp[i][j].qpos + 1); // FIXME: printf() is very slow...
                        }
                    }
                }
            }
            putc('\n', pileup_fp);
        }
    }

    // clean up
    free(bc.tmp.s);
    bcf_destroy1(bcf_rec);
    if (bcf_fp)
    {
        hts_close(bcf_fp);
        bcf_hdr_destroy(bcf_hdr);
        bcf_call_destroy(bca);
        free(bc.PL);
        free(bc.DP4);
        free(bc.DPR);
        free(bc.fmt_arr);
        free(bcr);
    }
    if (pileup_fp && conf->output_fname) fclose(pileup_fp);
    bam_smpl_destroy(sm); free(buf.s);
    for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]);
    free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp);
    bcf_call_del_rghash(rghash);
    bam_mplp_destroy(iter);
    bam_hdr_destroy(h);
    for (i = 0; i < n; ++i) {
        sam_close(data[i]->fp);
        if (data[i]->iter) hts_itr_destroy(data[i]->iter);
        free(data[i]);
    }
    free(data); free(plp); free(ref); free(n_plp);
    return ret;
}
コード例 #12
0
ファイル: flank_detector.cpp プロジェクト: kchennen/vt
/**
 * Detect repeat region.
 */
void FlankDetector::detect_flanks(bcf_hdr_t* h, bcf1_t *v, Variant& variant, uint32_t mode)
{
    VNTR& vntr = variant.vntr;

    //simple single base pair clipping of ends
    if (mode==CLIP_ENDS)
    {
        if (debug)
        {
            std::cerr << "********************************************\n";
            std::cerr << "CLIP ENDS\n";
            std:: cerr << "\n";
        }

        //in the case of simple indels, it is guaranteed that repeat tract length 
        //is always greater than 2 after exact alignment for deletions. This is 
        //because at least one end of the REF and ALT will end or start in the same base, 
        //and this allows the exact alignment detection of the repeat region to always 
        //increase by at least 1 base.
        //
        //exceptions:
        //
        //    simple insertion
        //    C[G]C  => [GC]
        //      GA
        //
        //    complex substitution
        //    C[G]C  => [G]
        //      TA
        if (vntr.exact_repeat_tract.size()>2) 
        {
            vntr.exact_repeat_tract = vntr.exact_repeat_tract.substr(1, vntr.exact_repeat_tract.size()-2);
            ++vntr.exact_rbeg1;
        }

        vntr.ru = choose_repeat_unit(vntr.exact_repeat_tract, vntr.motif);
        
//        std::cerr << "repeat tract : "
        
        vntr.exact_rend1 = vntr.exact_rbeg1+vntr.exact_rl-1;

        ahmm->set_model(vntr.ru.c_str());
        ahmm->align(vntr.exact_repeat_tract.c_str(), qual.c_str());

        vntr.exact_motif_concordance = ahmm->motif_concordance;
        vntr.exact_no_exact_ru = ahmm->exact_motif_count;
        vntr.exact_total_no_ru = ahmm->motif_count;
        vntr.exact_rl = ahmm->repeat_tract_len;

        if (debug)
        {
            vntr.print();
        }
    }
    else if (mode==FRAHMM)
    {
        if (debug)
        {
            std::cerr << "********************************************\n";
            std::cerr << "DETECTING REPEAT TRACT FUZZILY\n";
        }

        /////////////////
        //exact alignment
        /////////////////
        if (debug)
        {
            std::cerr << "++++++++++++++++++++++++++++++++++++++++++++\n";
            std::cerr << "Exact left/right alignment\n";
        }

        if (vntr.exact_repeat_tract.size()>2)
        {
            //removing the anchor bases
            if (vntr.mlen==1)
            {
                int32_t offset = 0;
                int32_t length = vntr.exact_repeat_tract.size();
                if (vntr.exact_repeat_tract.at(0)!=vntr.motif.at(0))
                {
                    offset = 1;
                    ++vntr.exact_rbeg1;
                }

                if (vntr.exact_repeat_tract.at(vntr.exact_repeat_tract.size()-1)!=vntr.motif.at(0))
                {
                    length -= offset+1;
                    --vntr.exact_rend1;
                }

                vntr.exact_repeat_tract = vntr.exact_repeat_tract.substr(offset, length);
            }
            else
            {
                if (vntr.exact_repeat_tract.size()>=3)
                {
                    vntr.exact_repeat_tract = vntr.exact_repeat_tract.substr(1, vntr.exact_repeat_tract.size()-2);
                    ++vntr.exact_rbeg1;
                    --vntr.exact_rend1;
                }
            }
        }
        //this is for nonexistent repeat units
        // 
        // RU : T
        // repeat_tract : G[T]C where T is an insert
        else if (vntr.exact_repeat_tract.size()==2)
        {
            
        }

        vntr.ru = choose_repeat_unit(vntr.exact_repeat_tract, vntr.motif);
        ahmm->set_model(vntr.ru.c_str());
        ahmm->align(vntr.exact_repeat_tract.c_str(), qual.c_str());

        vntr.exact_motif_concordance = ahmm->motif_concordance;
        vntr.exact_no_exact_ru = ahmm->exact_motif_count;
        vntr.exact_total_no_ru = ahmm->motif_count;
        vntr.exact_rl = ahmm->repeat_tract_len;

        if (debug)
        {
            std::cerr << "\n";
            std::cerr << "repeat_tract              : " << vntr.exact_repeat_tract << "\n";
            std::cerr << "position                  : [" << vntr.exact_rbeg1 << "," << vntr.exact_rend1 << "]\n";
            std::cerr << "motif_concordance         : " << vntr.exact_motif_concordance << "\n";
            std::cerr << "repeat units              : " << vntr.exact_rl << "\n";
            std::cerr << "exact repeat units        : " << vntr.exact_no_exact_ru << "\n";
            std::cerr << "total no. of repeat units : " << vntr.exact_total_no_ru << "\n";
            std::cerr << "\n";
        }

        ///////////////////////
        //fuzzy right alignment
        ///////////////////////
        if (debug)
        {
            std::cerr << "++++++++++++++++++++++++++++++++++++++++++++\n";
            std::cerr << "Fuzzy right alignment\n";
        }

        int32_t slen = 100;

        char* rflank;
        int32_t rflank_len;
        char* lflank;
        int32_t lflank_len;

        int32_t lflank_end1;
        int32_t rflank_beg1;

        char* seq;
        int32_t seq_len;
//        bool encountered_N = false;
        
        while (true)
        {
            //pick 5 bases to the right
            rflank = faidx_fetch_seq(fai, variant.chrom.c_str(), vntr.exact_rend1+1-1, vntr.exact_rend1+5-1, &rflank_len);

            //pick 105 bases for aligning

            seq = faidx_fetch_seq(fai, variant.chrom.c_str(), vntr.exact_rend1-slen-1, vntr.exact_rend1+5-1, &seq_len);
 
            
            rfhmm->set_model(vntr.ru.c_str(), rflank);
            rfhmm->align(seq, qual.c_str());
            if (debug) rfhmm->print_alignment();

            if (rflank_len) free(rflank);
            if (seq_len) free(seq);

            //////////////////////
            //fuzzy left alignment
            //////////////////////
            if (debug)
            {
                std::cerr << "\n";
                std::cerr << "++++++++++++++++++++++++++++++++++++++++++++\n";
                std::cerr << "Fuzzy left alignment\n";
            }

            //this is a hack around rfhmm rigidity in modeling the RUs
            //todo: we should change this to a reverse version of LFHMM!!!!
            if (rfhmm->get_lflank_read_epos1()>2*vntr.ru.size())
            {
                lflank_end1 = vntr.exact_rend1-slen-1+1 + rfhmm->get_lflank_read_epos1() - 1;
                break;
            }
            else if (slen==1000)
            {
                lflank_end1 = vntr.exact_rend1 - 1000 - 1;
                vntr.is_large_repeat_tract = true;
                break;
            }
            else
            {
                slen +=100;
                
                if (debug)
                    std::cerr << "extending the reference sequence for RFHMM : " << slen << "\n";
            }
        }

        slen = 100;

        //pick 5 bases to right
        while(true)
        {
            lflank = faidx_fetch_seq(fai, variant.chrom.c_str(), lflank_end1-5-1, lflank_end1-1, &lflank_len);

            //pick 105 bases for aligning
            seq = faidx_fetch_seq(fai, variant.chrom.c_str(), lflank_end1-5-1, lflank_end1+slen-1-1, &seq_len);

            lfhmm->set_model(lflank, vntr.ru.c_str());
            lfhmm->align(seq, qual.c_str());
            if (debug) lfhmm->print_alignment();

            if (seq_len) free(seq);

            if (lfhmm->get_rflank_read_epos1()!=INT32_MAX)
            {
                rflank_beg1 = lflank_end1 - 5 + lfhmm->get_rflank_read_spos1() -1;
                break;
            }
            else if (slen==1000)
            {
                rflank_beg1 = lflank_end1 + 1000;
                vntr.is_large_repeat_tract = true;
                break;
            }
            else
            {
                slen +=100;
                if (debug)
                    std::cerr << "extending the reference sequence for LFHMM : " << slen << "\n";
            }
        }

        if (lflank_len) free(lflank);

        lflank = faidx_fetch_seq(fai, variant.chrom.c_str(), lflank_end1-10-1, lflank_end1-1, &lflank_len);
        rflank = faidx_fetch_seq(fai, variant.chrom.c_str(), rflank_beg1-1, rflank_beg1 +10 -1 -1, &rflank_len);

        vntr.fuzzy_rbeg1 = lflank_end1+1;
        vntr.fuzzy_rend1 = rflank_beg1-1;
        int32_t repeat_tract_len;
        char* repeat_tract = faidx_fetch_seq(fai, variant.chrom.c_str(), lflank_end1, rflank_beg1-1-1, &repeat_tract_len);
        vntr.fuzzy_repeat_tract.assign(repeat_tract);
        if (repeat_tract_len) free(repeat_tract);
        vntr.fuzzy_motif_concordance = lfhmm->motif_concordance;
        vntr.fuzzy_no_exact_ru = lfhmm->exact_motif_count;
        vntr.fuzzy_total_no_ru = lfhmm->motif_count;
        vntr.fuzzy_rl = rflank_beg1-lflank_end1-1;

        if (lflank_len) free(lflank);
        if (rflank_len) free(rflank);

        if (debug)
        {
            std::cerr << "\n";
            vntr.print();
            std::cerr << "\n";
        }
    }

    //fill in flanks
    const char* chrom = variant.chrom.c_str();
    uint32_t pos1 = vntr.exact_rbeg1;
    int32_t len = 0;
    faidx_fetch_seq(fai, chrom, pos1-10, pos1-1, &len);
};
コード例 #13
0
ファイル: bam_plcmd.c プロジェクト: mwilkers/unceqr
static int mpileup(mplp_conf_t *conf, int n, char **fn)
{
	extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list);
	extern void bcf_call_del_rghash(void *rghash);
	mplp_aux_t **data;
	int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid = -1, max_depth, max_indel_depth;
	const bam_pileup1_t **plp;
	bam_mplp_t iter;
	bam_header_t *h = 0;
	char *ref;
	void *rghash = 0;

	bcf_callaux_t *bca = 0;
	bcf_callret1_t *bcr = 0;
	bcf_call_t bc;
	bcf_t *bp = 0;
	bcf_hdr_t *bh = 0;

	bam_sample_t *sm = 0;
	kstring_t buf;
	mplp_pileup_t gplp;

	memset(&gplp, 0, sizeof(mplp_pileup_t));
	memset(&buf, 0, sizeof(kstring_t));
	memset(&bc, 0, sizeof(bcf_call_t));
	data = calloc(n, sizeof(void*));
	plp = calloc(n, sizeof(void*));
	n_plp = calloc(n, sizeof(int*));
	sm = bam_smpl_init();

	// read the header and initialize data
	for (i = 0; i < n; ++i) {
		bam_header_t *h_tmp;
		data[i] = calloc(1, sizeof(mplp_aux_t));
		data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r");
		data[i]->conf = conf;
		h_tmp = bam_header_read(data[i]->fp);
		data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet
		bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text);
		rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list);
		if (conf->reg) {
			int beg, end;
			bam_index_t *idx;
			idx = bam_index_load(fn[i]);
			if (idx == 0) {
				fprintf(stderr, "[%s] fail to load index for %d-th input.\n", __func__, i+1);
				exit(1);
			}
			if (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) {
				fprintf(stderr, "[%s] malformatted region or wrong seqname for %d-th input.\n", __func__, i+1);
				exit(1);
			}
			if (i == 0) tid0 = tid, beg0 = beg, end0 = end;
			data[i]->iter = bam_iter_query(idx, tid, beg, end);
			bam_index_destroy(idx);
		}
		if (i == 0) h = h_tmp;
		else {
			// FIXME: to check consistency
			bam_header_destroy(h_tmp);
		}
	}
	gplp.n = sm->n;
	gplp.n_plp = calloc(sm->n, sizeof(int));
	gplp.m_plp = calloc(sm->n, sizeof(int));
	gplp.plp = calloc(sm->n, sizeof(void*));

	fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n);
	// write the VCF header
	if (conf->flag & MPLP_GLF) {
		kstring_t s;
		bh = calloc(1, sizeof(bcf_hdr_t));
		s.l = s.m = 0; s.s = 0;
		bp = bcf_open("-", (conf->flag&MPLP_NO_COMP)? "wu" : "w");
		for (i = 0; i < h->n_targets; ++i) {
			kputs(h->target_name[i], &s);
			kputc('\0', &s);
		}
		bh->l_nm = s.l;
		bh->name = malloc(s.l);
		memcpy(bh->name, s.s, s.l);
		s.l = 0;
		for (i = 0; i < sm->n; ++i) {
			kputs(sm->smpl[i], &s); kputc('\0', &s);
		}
		bh->l_smpl = s.l;
		bh->sname = malloc(s.l);
		memcpy(bh->sname, s.s, s.l);
		bh->txt = malloc(strlen(BAM_VERSION) + 64);
		bh->l_txt = 1 + sprintf(bh->txt, "##samtoolsVersion=%s\n", BAM_VERSION);
		free(s.s);
		bcf_hdr_sync(bh);
		bcf_hdr_write(bp, bh);
		bca = bcf_call_init(-1., conf->min_baseQ);
		bcr = calloc(sm->n, sizeof(bcf_callret1_t));
		bca->rghash = rghash;
		bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ;
		bca->min_frac = conf->min_frac;
		bca->min_support = conf->min_support;
	}
	if (tid0 >= 0 && conf->fai) { // region is set
		ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len);
		ref_tid = tid0;
		for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0;
	} else ref_tid = -1, ref = 0;
	iter = bam_mplp_init(n, mplp_func, (void**)data);
	max_depth = conf->max_depth;
	if (max_depth * sm->n > 1<<20)
		fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__);
	if (max_depth * sm->n < 8000) {
		max_depth = 8000 / sm->n;
		fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth);
	}
	max_indel_depth = conf->max_indel_depth * sm->n;
	bam_mplp_set_maxcnt(iter, max_depth);


	int storeSize = 100;

	int delStore[2][100] = {{0},{0}};

	typedef char * mstring;

	while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) {
		if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
		if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
		if (tid != ref_tid) {
			free(ref); ref = 0;
			if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len);
			for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid;
			ref_tid = tid;
		}
		if (conf->flag & MPLP_GLF) {
			int total_depth, _ref0, ref16;
			bcf1_t *b = calloc(1, sizeof(bcf1_t));
			for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i];
			group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG);
			_ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
			ref16 = bam_nt16_table[_ref0];
			for (i = 0; i < gplp.n; ++i)
				bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i);
			bcf_call_combine(gplp.n, bcr, ref16, &bc);
			bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
						 (conf->flag&MPLP_FMT_SP), 0, 0);
			bcf_write(bp, bh, b);
			bcf_destroy(b);
			// call indels
			if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) {
				for (i = 0; i < gplp.n; ++i)
					bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i);
				if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) {
					b = calloc(1, sizeof(bcf1_t));
					bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
								 (conf->flag&MPLP_FMT_SP), bca, ref);
					bcf_write(bp, bh, b);
					bcf_destroy(b);
				}
			}
		} else {
			printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N');
			for (i = 0; i < n; ++i) {
				int j;
				printf("\t%d\t", n_plp[i]);
				if (n_plp[i] == 0) {
					printf("*\t*"); // FIXME: printf() is very slow...
					if (conf->flag & MPLP_PRINT_POS) printf("\t*");
				} else {
					//MDW start					
					//for each position in the pileup column
					int charLen = 16;
					int countChars[ charLen ][2];
					int countiChars[ charLen ][2];

					int countGap[2]={0,0};

					//double qvTotal=0;
					int numStruck=0;
					int numGood=0;
					int tti;
					int ttj;
					mstring insAllele[100];
					int insAlleleCnt[100];
					int sf=0;
					int flag=0;

					//typedef char * string;
					char insStr0[10000];
					int iCnt0=0;

					char insStr1[10000];
					int iCnt1=0;

					char delStr0[10000];
					int dCnt0=0;

					char delStr1[10000];
					int dCnt1=0;


					float qposP[10000];
					int qposCnt=0;



					//initialize with zeros
						for(tti=0;tti<charLen;tti++){
						  countChars[tti][0]=0;
						  countChars[tti][1]=0;
						}

					// define repeat length here; look back up to 10 prior positions
					// start one position away.
					int replC=0; //
					for(tti=1;tti<=15;tti++){
						// check for greater than zero
						if(toupper(ref[pos-1])==toupper(ref[pos-tti])){
							replC++;
						}else{ // breaks the chain at first non identical to current position not strict homopolymer
							break;
						}
					}					
					int reprC=0; // 
					for(tti=1;tti<=15;tti++){
						// check for greater than zero
						if(toupper(ref[pos+1])==toupper(ref[pos+tti])){
							reprC++;
						}else{ // breaks the chain at first non identical to current position not strict homopolymer
							break;
						}
					}		
					int repT = replC;
					if(replC < reprC){
						repT=reprC;
					}



					for (j = 0; j < n_plp[i]; ++j){
						const bam_pileup1_t *p = plp[i] + j;
									
						/*
						SAME LOGIC AS pileup_seq()
						*/

						if(p->is_refskip){ // never count intron gaps in numStruck
							continue;
						}

						if(p->is_del){ // skip deletion gap, after first position which is the first aligned char
							continue;
						}

						if( 	p->b->core.qual < conf->min_mqToCount  || // mapping quality
							conf->maxrepC < (repT) || // max homopolymer run, this will not 
							(!p->is_del && bam1_qual(p->b)[p->qpos] < conf->min_baseQ) || // base quality for matches
							p->alignedQPosBeg <= (conf->trimEnd ) || p->alignedQPosEnd <= (conf->trimEnd ) ||  // trimEnd is 1-based
							p->zf == 1 || // fusion tag
							p->ih > conf->maxIH  || // max hit index
							(p->nmd > conf->maxNM) || // max mismatch
							(conf->flagFilter == 1 && !(p->b->core.flag&BAM_FPROPER_PAIR)) || // optionally keep only proper pairs
							(conf->flagFilter == 2 && p->b->core.flag&BAM_FSECONDARY) || // optionally strike secondary
							(conf->flagFilter == 3 && p->b->core.flag&BAM_FDUP) || // optionally strike dup
							(conf->flagFilter == 4 && (p->b->core.flag&BAM_FDUP || p->b->core.flag&BAM_FSECONDARY))  || // optionally strike secondary or dup
							(conf->flagFilter == 5 && (p->b->core.flag&BAM_FDUP || p->b->core.flag&BAM_FSECONDARY || p->b->core.flag&BAM_FQCFAIL || !(p->b->core.flag&BAM_FPROPER_PAIR) ))   // optionally strike secondary, dup and QCfail


						){
							numStruck++;
							continue;
						}

						
						//printf("repT=%d: %d %c %c %c %c \n",repT,p->indel,ref[pos],ref[pos-1],ref[pos-2],ref[pos-3]);


						if(!p->is_del && p->indel==0){
  						  countChars[ bam1_seqi(bam1_seq(p->b), p->qpos) ][ bam1_strand(p->b) ] ++;
						  numGood++;			

						}else if(p->is_refskip){
						  countGap[ bam1_strand(p->b) ]++;
						}
						
						if(p->indel<0){
    						  numGood++;			
						  if(bam1_strand(p->b) ==0){
							  for(tti=1;tti<= -p->indel; tti++) {
							    // current spot, starting at 0 in store, because indel<0 refers to next position
							   delStr0[dCnt0] =  ref[pos+tti];
							   dCnt0++;
							  }	
							  delStr0[dCnt0] = ',';
							  dCnt0++;
						  }else{
							  for(tti=1;tti<= -p->indel; tti++) {
							    // current spot, starting at 0 in store, because indel<0 refers to next position
							   delStr1[dCnt1] = ref[pos+tti];
							   dCnt1++;
							  }	
							  delStr1[dCnt1] = ',';
							  dCnt1++;
						  }



						}else if(p->indel>0){
						  numGood++;			

						  if(bam1_strand(p->b) ==0){
							  for(tti=1;tti<= p->indel; tti++) {
							    // current spot, starting at 0 in store, because indel<0 refers to next position
							   insStr0[iCnt0] = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + tti)];
							   iCnt0++;
							  }	
							  insStr0[iCnt0] = ',';
							  iCnt0++;
						  }else{
							  for(tti=1;tti<= p->indel; tti++) {
							    // current spot, starting at 0 in store, because indel<0 refers to next position
							   insStr1[iCnt1] = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + tti)];
							   iCnt1++;
							  }	
							  insStr1[iCnt1] = ',';
							  iCnt1++;
						  }


						}
						//calculate position of variant within aligned read - no soft clips
						if( toupper(ref[pos]) != toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]) || p->indel>0 || p->indel<0  ){

						//distance to end; calculate distance to end of aligned read.  removes soft clips.
						int distToEnd = (p->alignedQPosBeg < p->alignedQPosEnd) ? p->alignedQPosBeg : p->alignedQPosEnd;
						qposP[qposCnt] = distToEnd;						  
						qposCnt++;	
						// printf("id=%s, pos=%d",bam1_qname(p->b),distToEnd);
						}	
					}

					//

					//print A,C,G,T, by +/-
				        printf("\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d", 	countChars[1][0],countChars[1][1],
											countChars[2][0],countChars[2][1],
											countChars[4][0],countChars[4][1],
											countChars[8][0],countChars[8][1],
											countChars[7][0],countChars[7][1]);
					
					putchar('\t');
					for(tti=0;tti<dCnt0;tti++){
					  putchar(delStr0[tti]);
					}

					putchar('\t');
					for(tti=0;tti<dCnt1;tti++){
					  putchar(delStr1[tti]);
					}

					putchar('\t');
					for(tti=0;tti<iCnt0;tti++){
					  putchar(insStr0[tti]);
					}

					putchar('\t');
					for(tti=0;tti<iCnt1;tti++){
					  putchar(insStr1[tti]);
					}

					printf("\t%d\t%d",numGood,numStruck);					

					// get non-ref qpos variation

					float medqpos = -1;
					float medAbsDev = -1;
					if(qposCnt>0){
					  medqpos = median(qposCnt,qposP);
					  float absDev[qposCnt];
					  for(tti=0;tti<qposCnt;tti++){
						absDev[tti] = abs(medqpos - qposP[tti]);
					  }
					  medAbsDev = median(qposCnt-1,absDev);
					}
					printf("\t%f",medAbsDev);

					///END MDW
				}



			}
			putchar('\n');
		}
	}

	bcf_close(bp);
	bam_smpl_destroy(sm); free(buf.s);
	for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]);
	free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp);
	bcf_call_del_rghash(rghash);
	bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr);
	bam_mplp_destroy(iter);
	bam_header_destroy(h);
	for (i = 0; i < n; ++i) {
		bam_close(data[i]->fp);
		if (data[i]->iter) bam_iter_destroy(data[i]->iter);
		free(data[i]);
	}
	free(data); free(plp); free(ref); free(n_plp);
	return 0;
}
コード例 #14
0
ファイル: extract.c プロジェクト: bgruening/PileOMeth
void extractCalls(Config *config) {
    bam_hdr_t *hdr = sam_hdr_read(config->fp);
    bam_mplp_t iter;
    int ret, tid, pos, i, seqlen, type, rv, o = 0;
    int beg0 = 0, end0 = 1u<<29;
    int n_plp; //This will need to be modified for multiple input files
    int ctid = -1; //The tid of the contig whose sequence is stored in "seq"
    int idxBED = 0, strand;
    uint32_t nmethyl = 0, nunmethyl = 0;
    const bam_pileup1_t **plp = NULL;
    char *seq = NULL, base;
    mplp_data *data = NULL;
    struct lastCall *lastCpG = NULL;
    struct lastCall *lastCHG = NULL;

    if(config->merge) {
        if(config->keepCpG) {
            lastCpG = calloc(1, sizeof(struct lastCall));
            assert(lastCpG);
            lastCpG->tid = -1;
        }
        if(config->keepCHG) {
            lastCHG = calloc(1, sizeof(struct lastCall));
            assert(lastCHG);
            lastCHG->tid = -1;
        }
    }

    data = calloc(1,sizeof(mplp_data));
    if(data == NULL) {
        fprintf(stderr, "Couldn't allocate space for the data structure in extractCalls()!\n");
        return;
    }
    data->config = config;
    data->hdr = hdr;
    if (config->reg) {
        if((data->iter = sam_itr_querys(config->bai, hdr, config->reg)) == 0) {
            fprintf(stderr, "failed to parse regions %s", config->reg);
            return;
        }
    }
    if(config->bedName) {
        config->bed = parseBED(config->bedName, hdr);
        if(config->bed == NULL) return;
    }

    plp = calloc(1, sizeof(bam_pileup1_t *)); //This will have to be modified for multiple input files
    if(plp == NULL) {
        fprintf(stderr, "Couldn't allocate space for the plp structure in extractCalls()!\n");
        return;
    }

    //Start the pileup
    iter = bam_mplp_init(1, filter_func, (void **) &data);
    bam_mplp_init_overlaps(iter);
    bam_mplp_set_maxcnt(iter, config->maxDepth);
    while((ret = cust_mplp_auto(iter, &tid, &pos, &n_plp, plp)) > 0) {
        //Do we need to process this position?
        if (config->reg) {
            beg0 = data->iter->beg, end0 = data->iter->end;
            if ((pos < beg0 || pos >= end0)) continue; // out of the region requested
        }
        if(tid != ctid) {
            if(seq != NULL) free(seq);
            seq = faidx_fetch_seq(config->fai, hdr->target_name[tid], 0, faidx_seq_len(config->fai, hdr->target_name[tid]), &seqlen);
            if(seqlen < 0) {
                fprintf(stderr, "faidx_fetch_seq returned %i while trying to fetch the sequence for tid %i (%s)!\n",\
                        seqlen, tid, hdr->target_name[tid]);
                fprintf(stderr, "Note that the output will be truncated!\n");
                return;
            }
            ctid = tid;
        }

        if(config->bed) { //Handle -l
            while((o = posOverlapsBED(tid, pos, config->bed, idxBED)) == -1) idxBED++;
            if(o == 0) continue; //Wrong strand
        }

        if(isCpG(seq, pos, seqlen)) {
            if(!config->keepCpG) continue;
            type = 0;
        } else if(isCHG(seq, pos, seqlen)) {
            if(!config->keepCHG) continue;
            type = 1;
        } else if(isCHH(seq, pos, seqlen)) {
            if(!config->keepCHH) continue;
            type = 2;
        } else {
            continue;
        }

        nmethyl = nunmethyl = 0;
        base = *(seq+pos);
        for(i=0; i<n_plp; i++) {
            if(plp[0][i].is_del) continue;
            if(plp[0][i].is_refskip) continue;
            if(config->bed) if(!readStrandOverlapsBED(plp[0][i].b, config->bed->region[idxBED])) continue;
            strand = getStrand((plp[0]+i)->b);
            if(strand & 1) {
                if(base != 'C' && base != 'c') continue;
            } else {
                if(base != 'G' && base != 'g') continue;
            }
            rv = updateMetrics(config, plp[0]+i);
            if(rv > 0) nmethyl++;
            else if(rv<0) nunmethyl++;
        }

        if(nmethyl+nunmethyl==0) continue;
        if(!config->merge || type==2) {
            writeCall(config->output_fp[type], config, hdr->target_name[tid], pos, 1, nmethyl, nunmethyl);
        } else {
            //Merge into per-CpG/CHG metrics
            if(type==0) {
                if(base=='G' || base=='g') pos--;
                processLast(config->output_fp[0], config, lastCpG, hdr, tid, pos, 2, nmethyl, nunmethyl);
            } else {
                if(base=='G' || base=='g') pos-=2;
                processLast(config->output_fp[1], config, lastCHG, hdr, tid, pos, 3, nmethyl, nunmethyl);
            }
        }
    }

    //Don't forget the last CpG/CHG
    if(config->merge) {
        if(config->keepCpG && lastCpG->tid != -1) {
            processLast(config->output_fp[0], config, lastCpG, hdr, tid, pos, 2, nmethyl, nunmethyl);
        }
        if(config->keepCHG && lastCHG->tid != -1) {
            processLast(config->output_fp[1], config, lastCHG, hdr, tid, pos, 3, nmethyl, nunmethyl);
        }
    }

    bam_hdr_destroy(hdr);
    if(data->iter) hts_itr_destroy(data->iter);
    bam_mplp_destroy(iter);
    free(data);
    free(plp);
    if(seq != NULL) free(seq);
}
コード例 #15
0
int main_ld(int argc, char *argv[])
{
    int chr;                  //! chromosome identifier
    int beg;                  //! beginning coordinate for analysis
    int end;                  //! end coordinate for analysis
    int ref;                  //! ref
    long num_windows;         //! number of windows
    std::string msg;          //! string for error message
    bam_plbuf_t *buf;         //! pileup buffer
    ldData t;

    // parse the command line options
    std::string region = t.parseCommandLine(argc, argv);

    // check input BAM file for errors
    t.checkBAM();

    // initialize the sample data structure
    t.bam_smpl_init();

    // add samples
    t.bam_smpl_add();

    // initialize error model
    t.em = errmod_init(1.0-0.83);

    // parse genomic region
    int k = bam_parse_region(t.h, region, &chr, &beg, &end);
    if (k < 0)
    {
        msg = "Bad genome coordinates: " + region;
        fatal_error(msg, __FILE__, __LINE__, 0);
    }

    // fetch reference sequence
    t.ref_base = faidx_fetch_seq(t.fai_file, t.h->target_name[chr], 0, 0x7fffffff, &(t.len));

    // calculate the number of windows
    if (t.flag & BAM_WINDOW)
        num_windows = ((end-beg)-1)/t.win_size;
    else
    {
        t.win_size = (end-beg);
        num_windows = 1;
    }

    // iterate through all windows along specified genomic region
    for (long cw=0; cw < num_windows; cw++)
    {

        // construct genome coordinate string
        std::string scaffold_name(t.h->target_name[chr]);
        std::ostringstream winc(scaffold_name);
        winc.seekp(0, std::ios::end);
        winc << ":" << beg+(cw*t.win_size)+1 << "-" << ((cw+1)*t.win_size)+(beg-1);
        std::string winCoord = winc.str();

        // initialize number of sites to zero
        t.num_sites = 0;

        // parse the BAM file and check if region is retrieved from the reference
        if (t.flag & BAM_WINDOW)
        {
            k = bam_parse_region(t.h, winCoord, &ref, &(t.beg), &(t.end));
            if (k < 0)
            {
                msg = "Bad window coordinates " + winCoord;
                fatal_error(msg, __FILE__, __LINE__, 0);
            }
        }
        else
        {
            ref = chr;
            t.beg = beg;
            t.end = end;
            if (ref < 0)
            {
                msg = "Bad scaffold name: " + region;
                fatal_error(msg, __FILE__, __LINE__, 0);
            }
        }

        // initialize nucdiv variables
        t.init_ld();

        // create population assignments
        t.assign_pops();

        // initialize pileup
        buf = bam_plbuf_init(make_ld, &t);

        // fetch region from bam file
        if ((bam_fetch(t.bam_in->x.bam, t.idx, ref, t.beg, t.end, buf, fetch_func)) < 0)
        {
            msg = "Failed to retrieve region " + region + " due to corrupted BAM index file";
            fatal_error(msg, __FILE__, __LINE__, 0);
        }

        // finalize pileup
        bam_plbuf_push(0, buf);

        // calculate linkage disequilibrium statistics
        ld_func fp[3] = {&ldData::calc_zns, &ldData::calc_omegamax, &ldData::calc_wall};
        (t.*fp[t.output])();

        // print results to stdout
        t.print_ld(chr);

        // take out the garbage
        t.destroy_ld();
        bam_plbuf_destroy(buf);
    }
    // end of window interation

    errmod_destroy(t.em);
    samclose(t.bam_in);
    bam_index_destroy(t.idx);
    t.bam_smpl_destroy();
    free(t.ref_base);

    return 0;
}
コード例 #16
0
ファイル: test_realn.c プロジェクト: atks/vt
int main(int argc, char **argv) {
    htsFile *in = NULL;
    htsFile *out = NULL;
    char *in_name = "-";
    char *out_name = "-";
    char *ref_name = NULL;
    char *ref_seq = NULL;
    char modew[8] = "w";
    faidx_t *fai = NULL;
    bam_hdr_t *hdr = NULL;
    bam1_t *rec = NULL;
    int c, res, last_ref = -1, ref_len = 0;
    int adjust = 0, extended = 0, recalc = 0, flags = 0;

    while ((c = getopt(argc, argv, "aef:hi:o:r")) >= 0) {
        switch (c) {
        case 'a': adjust = 1; break;
        case 'e': extended = 1; break;
        case 'f': ref_name = optarg; break;
        case 'h': usage(argv[0]); return EXIT_SUCCESS;
        case 'i': in_name = optarg; break;
        case 'o': out_name = optarg; break;
        case 'r': recalc = 1; break;
        default: usage(argv[0]); return EXIT_FAILURE;
        }
    }

    if (!ref_name) {
        usage(argv[0]);
        return EXIT_FAILURE;
    }

    flags = (adjust ? 1 : 0) | (extended ? 2 : 0) | (recalc ? 4 : 0);

    fai = fai_load(ref_name);
    if (!fai) {
        fprintf(stderr, "Couldn't load reference %s\n", ref_name);
        goto fail;
    }

    rec = bam_init1();
    if (!rec) {
        perror(NULL);
        goto fail;
    }

    in = hts_open(in_name, "r");
    if (!in) {
        fprintf(stderr, "Couldn't open %s : %s\n", in_name, strerror(errno));
        goto fail;
    }

    hdr = sam_hdr_read(in);
    if (!hdr) {
        fprintf(stderr, "Couldn't read header for %s\n", in_name);
        goto fail;
    }

    out = hts_open(out_name, modew);
    if (!out) {
        fprintf(stderr, "Couldn't open %s : %s\n", out_name, strerror(errno));
        goto fail;
    }

    if (sam_hdr_write(out, hdr) < 0) {
        fprintf(stderr, "Couldn't write header to %s : %s\n",
                out_name, strerror(errno));
        goto fail;
    }

    while ((res = sam_read1(in, hdr, rec)) >= 0) {
        if (rec->core.tid >= hdr->n_targets) {
            fprintf(stderr, "Invalid BAM reference id %d\n", rec->core.tid);
            goto fail;
        }
        if (last_ref != rec->core.tid && rec->core.tid >= 0) {
            free(ref_seq);
            ref_seq = faidx_fetch_seq(fai, hdr->target_name[rec->core.tid],
                                      0, INT_MAX, &ref_len);
            if (!ref_seq) {
                fprintf(stderr, "Couldn't get reference %s\n",
                        hdr->target_name[rec->core.tid]);
                goto fail;
            }
            last_ref = rec->core.tid;
        }
        if (rec->core.tid >= 0) {
            res = sam_prob_realn(rec, ref_seq, ref_len, flags);
            if (res <= -4) {
                fprintf(stderr, "Error running sam_prob_realn : %s\n",
                        strerror(errno));
                goto fail;
            }
        }
        if (sam_write1(out, hdr, rec) < 0) {
            fprintf(stderr, "Error writing to %s\n", out_name);
            goto fail;
        }
    }
    res = hts_close(in);
    in = NULL;
    if (res < 0) {
        fprintf(stderr, "Error closing %s\n", in_name);
        goto fail;
    }

    res = hts_close(out);
    out = NULL;
    if (res < 0) {
        fprintf(stderr, "Error closing %s\n", out_name);
        goto fail;
    }

    bam_hdr_destroy(hdr);
    bam_destroy1(rec);
    free(ref_seq);
    fai_destroy(fai);

    return EXIT_SUCCESS;

 fail:
    if (hdr) bam_hdr_destroy(hdr);
    if (rec) bam_destroy1(rec);
    if (in) hts_close(in);
    if (out) hts_close(out);
    free(ref_seq);
    fai_destroy(fai);
    return EXIT_FAILURE;
}