/** * Extract reference sequence region for motif discovery in a fuzzy fashion. */ void CandidateRegionExtractor::extract_regions_by_fuzzy_alignment(bcf_hdr_t* h, bcf1_t* v, Variant& variant) { if (debug) { if (debug) std::cerr << "********************************************\n"; std::cerr << "EXTRACTIING REGION BY FUZZY ALIGNMENT\n\n"; } VNTR& vntr = variant.vntr; const char* chrom = bcf_get_chrom(h, v); int32_t min_beg1 = bcf_get_pos1(v); int32_t max_end1 = min_beg1; //merge candidate search region for (size_t i=1; i<bcf_get_n_allele(v); ++i) { std::string ref(bcf_get_alt(v, 0)); std::string alt(bcf_get_alt(v, i)); int32_t pos1 = bcf_get_pos1(v); trim(pos1, ref, alt); if (debug) { std::cerr << "indel fragment : " << (ref.size()<alt.size()? alt : ref) << "\n"; std::cerr << " : " << ref << ":" << alt << "\n"; } min_beg1 = fuzzy_left_align(chrom, pos1, ref, alt, 3); max_end1 = fuzzy_right_align(chrom, pos1 + ref.size() - 1, ref, alt, 3); int32_t seq_len; char* seq = faidx_fetch_seq(fai, chrom, min_beg1-1, max_end1-1, &seq_len); if (debug) { std::cerr << "FUZZY REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") " << "\n"; std::cerr << " " << seq << "\n"; } if (seq_len) free(seq); } int32_t seq_len; char* seq = faidx_fetch_seq(fai, chrom, min_beg1-1, max_end1-1, &seq_len); if (debug) { std::cerr << "FINAL FUZZY REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") " << "\n"; std::cerr << " " << seq << "\n"; } vntr.exact_repeat_tract = seq; vntr.exact_rbeg1 = min_beg1; if (seq_len) free(seq); }
/*
 * Look up the reference sequence for target `tid`, loading it on demand.
 *
 * A two-entry cache (slot 0 = most recent, slot 1 = previous) is kept in
 * ma->ref so that alternating between two targets does not reload them.
 *
 * @param ma       pileup auxiliary data; ma->ref is the cache, ma->conf->fai the index
 * @param tid      target id whose reference is wanted
 * @param ref      out: pointer to the cached sequence (owned by the cache), NULL on failure
 * @param ref_len  out: length of the sequence; left untouched on failure
 * @return 1 when a sequence is returned, 0 when no cache/index is available
 *         or the sequence could not be fetched
 */
static int mplp_get_ref(mplp_aux_t *ma, int tid, char **ref, int *ref_len) {
    mplp_ref_t *r = ma->ref;

    //printf("get ref %d {%d/%p, %d/%p}\n", tid, r->ref_id[0], r->ref[0], r->ref_id[1], r->ref[1]);

    if (!r || !ma->conf->fai) {
        *ref = NULL;
        return 0;
    }

    // Do we need to reference count this so multiple mplp_aux_t can
    // track which references are in use?
    // For now we just cache the last two. Sufficient?
    if (tid == r->ref_id[0]) {
        *ref = r->ref[0];
        *ref_len = r->ref_len[0];
        return 1;
    }

    if (tid == r->ref_id[1]) {
        // Hit on the older entry: swap the two slots so it becomes the newest
        int tmp;
        char *tc;

        tmp = r->ref_id[0];  r->ref_id[0]  = r->ref_id[1];  r->ref_id[1]  = tmp;
        tmp = r->ref_len[0]; r->ref_len[0] = r->ref_len[1]; r->ref_len[1] = tmp;
        tc  = r->ref[0];     r->ref[0]     = r->ref[1];     r->ref[1]     = tc;

        *ref = r->ref[0];
        *ref_len = r->ref_len[0];
        return 1;
    }

    // New, so migrate to old and load new
    free(r->ref[1]);
    r->ref[1]     = r->ref[0];
    r->ref_id[1]  = r->ref_id[0];
    r->ref_len[1] = r->ref_len[0];

    r->ref_id[0] = tid;
    r->ref[0] = faidx_fetch_seq(ma->conf->fai,
                                ma->h->target_name[r->ref_id[0]],
                                0,
                                0x7fffffff,
                                &r->ref_len[0]);

    // The fetched pointer must be tested; `r->ref` is the array itself and is
    // never NULL, so the old check could not detect a failed fetch.
    if (!r->ref[0]) {
        r->ref[0] = NULL;
        r->ref_id[0] = -1;
        r->ref_len[0] = 0;
        *ref = NULL;
        return 0;
    }

    *ref = r->ref[0];
    *ref_len = r->ref_len[0];
    return 1;
}
/* check match between reference and bam files. prints an error * message and return non-zero on mismatch */ int checkref(char *fasta_file, char *bam_file) { int i = -1; bam_header_t *header; faidx_t *fai; char *ref; int ref_len = -1; bamFile bam_fp; if (! file_exists(fasta_file)) { LOG_FATAL("Fsata file %s does not exist. Exiting...\n", fasta_file); return 1; } if (0 != strcmp(bam_file, "-") && ! file_exists(bam_file)) { LOG_FATAL("BAM file %s does not exist. Exiting...\n", bam_file); return 1; } bam_fp = strcmp(bam_file, "-") == 0 ? bam_dopen(fileno(stdin), "r") : bam_open(bam_file, "r"); header = bam_header_read(bam_fp); if (!header) { LOG_FATAL("Failed to read BAM header from %s\n", bam_file); return 1; } fai = fai_load(fasta_file); if (!fai) { LOG_FATAL("Failed to fasta index for %s\n", fasta_file); return 1; } for (i=0; i < header->n_targets; i++) { LOG_DEBUG("BAM header target %d of %d: name=%s len=%d\n", i+1, header->n_targets, header->target_name[i], header->target_len[i]); ref = faidx_fetch_seq(fai, header->target_name[i], 0, 0x7fffffff, &ref_len); if (NULL == ref) { LOG_FATAL("Failed to fetch sequence %s from fasta file\n", header->target_name[i]); return -1; } if (header->target_len[i] != ref_len) { LOG_FATAL("Sequence length mismatch for sequence %s (%dbp in fasta; %dbp in bam)\n", header->target_name[i], header->target_len[i], ref_len); return -1; } free(ref); } fai_destroy(fai); bam_header_destroy(header); bam_close(bam_fp); return 0; }
std::string IndexedFastaReader::Subsequence(const std::string& id, Position begin, Position end) const { REQUIRE_FAIDX_LOADED; int len; // Derek: *Annoyingly* htslib seems to interpret "end" as inclusive in // faidx_fetch_seq, whereas it considers it exclusive in the region spec in // fai_fetch. Can you please verify? char* rawSeq = faidx_fetch_seq(handle_, id.c_str(), begin, end - 1, &len); if (rawSeq == nullptr) throw std::runtime_error("could not fetch FASTA sequence"); else { std::string seq(rawSeq); free(rawSeq); return seq; } }
/**
 * Right align alleles.
 *
 * While REF and ALT start with the same base, that base is dropped from
 * both, the next reference base is appended to both, and pos1 is advanced.
 *
 * @chrom - chromosome
 * @pos1  - position that is advanced by one for every shift (updated in place)
 * @ref   - reference sequence (updated in place, must not be empty)
 * @alt   - alternative sequence (updated in place, must not be empty)
 *
 * Exits the program if the reference base cannot be read.
 */
void CandidateRegionExtractor::right_align(const char* chrom, int32_t& pos1, std::string& ref, std::string& alt)
{
    while (ref.at(0)==alt.at(0))
    {
        int32_t seq_len = 0;
        char* seq = faidx_fetch_seq(fai, chrom, pos1, pos1, &seq_len);

        //a failed fetch returns NULL together with a negative length,
        //so testing seq_len for non-zero alone is not sufficient
        if (seq==NULL || seq_len<=0)
        {
            free(seq);
            fprintf(stderr, "[%s:%d %s] Cannot read from sequence file\n", __FILE__,__LINE__,__FUNCTION__);
            exit(1);
        }

        const char next_base = seq[0];
        free(seq);

        ref.erase(0,1);
        alt.erase(0,1);
        ref.push_back(next_base);
        alt.push_back(next_base);
        ++pos1;
    }
}
/**
 * Left align alleles.
 *
 * While REF and ALT end with the same base and pos1 is greater than 1, that
 * base is dropped from both, the preceding reference base is prepended to
 * both, and pos1 is decremented.
 *
 * @chrom - chromosome
 * @pos1  - position that is decremented by one for every shift (updated in place)
 * @ref   - reference sequence (updated in place, must not be empty)
 * @alt   - alternative sequence (updated in place, must not be empty)
 *
 * Exits the program if the reference base cannot be read.
 */
void CandidateRegionExtractor::left_align(const char* chrom, int32_t& pos1, std::string& ref, std::string& alt)
{
    while (ref.at(ref.size()-1)==alt.at(alt.size()-1) && pos1>1)
    {
        int32_t seq_len = 0;
        char* seq = faidx_fetch_seq(fai, chrom, pos1-2, pos1-2, &seq_len);

        //a failed fetch returns NULL together with a negative length,
        //so testing seq_len for non-zero alone is not sufficient
        if (seq==NULL || seq_len<=0)
        {
            free(seq);
            fprintf(stderr, "[%s:%d %s] Cannot read from sequence file\n", __FILE__,__LINE__,__FUNCTION__);
            exit(1);
        }

        const char prev_base = seq[0];
        free(seq);

        ref.erase(ref.size()-1,1);
        alt.erase(alt.size()-1,1);
        ref.insert(0, 1, prev_base);
        alt.insert(0, 1, prev_base);
        --pos1;
    }
}
/*
 * Multi-file pileup driver.
 *
 * Stages, in the order they appear in the body:
 *   1. Allocate per-input bookkeeping (data, plp, n_plp) and the sample table.
 *   2. For each of the n inputs in fn ("-" selects stdin): open it, read its
 *      header, register it with the sample table and the read-group hash and,
 *      when conf->reg is set, load the index and create a region iterator.
 *      The header of the first input is kept in h; the others are destroyed.
 *   3. When MPLP_GLF is set, build and write a BCF header to "-" and set up
 *      the caller state (bca, bcr).
 *   4. When a region and conf->fai are both set, pre-fetch the reference of
 *      the region's target.
 *   5. Iterate positions with bam_mplp_auto(). Positions outside the region
 *      or the BED intervals are skipped. The reference is re-fetched whenever
 *      tid changes. Per position either BCF records are written (site record
 *      plus, unless MPLP_NO_INDEL, an indel record) or a text pileup line is
 *      printed with printf/putchar.
 *   6. Release everything that was allocated.
 *
 * @param conf  pileup configuration (flags, region, fai, bed, depth limits, ...)
 * @param n     number of entries in fn
 * @param fn    input file names
 * @return always 0; the visible error paths call exit(1)
 *
 * NOTE(review): the results of bam_open()/bam_dopen() and bam_header_read()
 * are used without a NULL check.
 * NOTE(review): n_plp holds ints but is allocated with sizeof(int*).
 * NOTE(review): "8000 / sm->n" divides by zero if sm->n is 0 - presumably the
 * case when n == 0; confirm against bam_smpl_add().
 */
static int mpileup(mplp_conf_t *conf, int n, char **fn) { extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list); extern void bcf_call_del_rghash(void *rghash); mplp_aux_t **data; int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid = -1, max_depth, max_indel_depth; const bam_pileup1_t **plp; bam_mplp_t iter; bam_header_t *h = 0; char *ref; void *rghash = 0; bcf_callaux_t *bca = 0; bcf_callret1_t *bcr = 0; bcf_call_t bc; bcf_t *bp = 0; bcf_hdr_t *bh = 0; bam_sample_t *sm = 0; kstring_t buf; mplp_pileup_t gplp; memset(&gplp, 0, sizeof(mplp_pileup_t)); memset(&buf, 0, sizeof(kstring_t)); memset(&bc, 0, sizeof(bcf_call_t)); data = calloc(n, sizeof(void*)); plp = calloc(n, sizeof(void*)); n_plp = calloc(n, sizeof(int*)); sm = bam_smpl_init(); // read the header and initialize data for (i = 0; i < n; ++i) { bam_header_t *h_tmp; data[i] = calloc(1, sizeof(mplp_aux_t)); data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r"); data[i]->conf = conf; h_tmp = bam_header_read(data[i]->fp); data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 
0 : h_tmp->text); rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list); if (conf->reg) { int beg, end; bam_index_t *idx; idx = bam_index_load(fn[i]); if (idx == 0) { fprintf(stderr, "[%s] fail to load index for %d-th input.\n", __func__, i+1); exit(1); } if (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) { fprintf(stderr, "[%s] malformatted region or wrong seqname for %d-th input.\n", __func__, i+1); exit(1); } if (i == 0) tid0 = tid, beg0 = beg, end0 = end; data[i]->iter = bam_iter_query(idx, tid, beg, end); bam_index_destroy(idx); } if (i == 0) h = h_tmp; else { // FIXME: to check consistency bam_header_destroy(h_tmp); } } gplp.n = sm->n; gplp.n_plp = calloc(sm->n, sizeof(int)); gplp.m_plp = calloc(sm->n, sizeof(int)); gplp.plp = calloc(sm->n, sizeof(void*)); fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n); // write the VCF header if (conf->flag & MPLP_GLF) { kstring_t s; bh = calloc(1, sizeof(bcf_hdr_t)); s.l = s.m = 0; s.s = 0; bp = bcf_open("-", (conf->flag&MPLP_NO_COMP)? 
"wu" : "w"); for (i = 0; i < h->n_targets; ++i) { kputs(h->target_name[i], &s); kputc('\0', &s); } bh->l_nm = s.l; bh->name = malloc(s.l); memcpy(bh->name, s.s, s.l); s.l = 0; for (i = 0; i < sm->n; ++i) { kputs(sm->smpl[i], &s); kputc('\0', &s); } bh->l_smpl = s.l; bh->sname = malloc(s.l); memcpy(bh->sname, s.s, s.l); bh->txt = malloc(strlen(BAM_VERSION) + 64); bh->l_txt = 1 + sprintf(bh->txt, "##samtoolsVersion=%s\n", BAM_VERSION); free(s.s); bcf_hdr_sync(bh); bcf_hdr_write(bp, bh); bca = bcf_call_init(-1., conf->min_baseQ); bcr = calloc(sm->n, sizeof(bcf_callret1_t)); bca->rghash = rghash; bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ; bca->min_frac = conf->min_frac; bca->min_support = conf->min_support; } if (tid0 >= 0 && conf->fai) { // region is set ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len); ref_tid = tid0; for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0; } else ref_tid = -1, ref = 0; iter = bam_mplp_init(n, mplp_func, (void**)data); max_depth = conf->max_depth; if (max_depth * sm->n > 1<<20) fprintf(stderr, "(%s) Max depth is above 1M. 
Potential memory hog!\n", __func__); if (max_depth * sm->n < 8000) { max_depth = 8000 / sm->n; fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth); } max_indel_depth = conf->max_indel_depth * sm->n; bam_mplp_set_maxcnt(iter, max_depth); while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) { if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; if (tid != ref_tid) { free(ref); ref = 0; if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len); for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid; ref_tid = tid; } if (conf->flag & MPLP_GLF) { int total_depth, _ref0, ref16; bcf1_t *b = calloc(1, sizeof(bcf1_t)); for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i]; group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG); _ref0 = (ref && pos < ref_len)? ref[pos] : 'N'; ref16 = bam_nt16_table[_ref0]; for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i); bcf_call_combine(gplp.n, bcr, ref16, &bc); bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0, (conf->flag&MPLP_FMT_SP), 0, 0); bcf_write(bp, bh, b); bcf_destroy(b); // call indels if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) { for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i); if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) { b = calloc(1, sizeof(bcf1_t)); bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0, (conf->flag&MPLP_FMT_SP), bca, ref); bcf_write(bp, bh, b); bcf_destroy(b); } } } else { printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? 
ref[pos] : 'N'); for (i = 0; i < n; ++i) { int j; printf("\t%d\t", n_plp[i]); if (n_plp[i] == 0) { printf("*\t*"); // FIXME: printf() is very slow... if (conf->flag & MPLP_PRINT_POS) printf("\t*"); } else { for (j = 0; j < n_plp[i]; ++j) pileup_seq(plp[i] + j, pos, ref_len, ref); putchar('\t'); for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; int c = bam1_qual(p->b)[p->qpos] + 33; if (c > 126) c = 126; putchar(c); } if (conf->flag & MPLP_PRINT_MAPQ) { putchar('\t'); for (j = 0; j < n_plp[i]; ++j) { int c = plp[i][j].b->core.qual + 33; if (c > 126) c = 126; putchar(c); } } if (conf->flag & MPLP_PRINT_POS) { putchar('\t'); for (j = 0; j < n_plp[i]; ++j) { if (j > 0) putchar(','); printf("%d", plp[i][j].qpos + 1); // FIXME: printf() is very slow... } } } } putchar('\n'); } } bcf_close(bp); bam_smpl_destroy(sm); free(buf.s); for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]); free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp); bcf_call_del_rghash(rghash); bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr); bam_mplp_destroy(iter); bam_header_destroy(h); for (i = 0; i < n; ++i) { bam_close(data[i]->fp); if (data[i]->iter) bam_iter_destroy(data[i]->iter); free(data[i]); } free(data); free(plp); free(ref); free(n_plp); return 0; }
/**
 * Collects GC-content observations for foreground (observed) and background
 * (randomly placed) fragments.
 *
 * Steps visible in the body:
 *  - open the fasta index of ref_filename (Logger::abort on failure);
 *  - dump at most 10,000,000 positions from foreground_position_table and
 *    sort them with ReadPosSeqnameCmp so each reference sequence is fetched once;
 *  - whenever the sequence name changes, fetch the whole sequence, lower-case
 *    it, and build its two-bit encoding and reverse complement; a sequence
 *    that cannot be fetched only triggers a warning and its positions are skipped;
 *  - skip positions on sequences shorter than median_frag_len and positions
 *    with count > 4;
 *  - background: draw a position uniformly from
 *    [start + L, end - median_frag_len] and record the GC fraction of the
 *    median_frag_len bases starting there, weighted by 1 / sb;
 *  - foreground: record the GC fraction of the fragment starting at pos
 *    (strand 0) or ending at pos (other strand) when it lies within
 *    [start, end], weighted by 1 / sb.
 *
 * sb is the product of seqbias[0] and seqbias[1] bias values, or 1.0 when
 * seqbias[0] is null. L is seqbias[0]->getL(), or 0 when seqbias[0] is null.
 *
 * @param ref_filename               fasta file, opened with fai_load
 * @param foreground_position_table  source of the observed read positions
 * @param median_frag_len            fragment length used for every GC window
 * @param seqbias                    pair of sequence bias models; seqbias[1] is
 *                                   dereferenced whenever seqbias[0] is non-null
 * @param task_name                  name of the Logger task used for progress
 *
 * NOTE(review): nothing here checks that start + L <= end - median_frag_len
 * before constructing the uniform distribution - confirm callers guarantee it.
 */
GCBias::GCBias(const char* ref_filename, PosTable& foreground_position_table, pos_t median_frag_len, sequencing_bias* seqbias[2], const char* task_name) { faidx_t* ref_file = fai_load(ref_filename); if (!ref_file) { Logger::abort("Can't open fasta file '%s'.", ref_filename); } std::vector<ReadPos> foreground_positions; const size_t max_dump = 10000000; foreground_position_table.dump(foreground_positions, max_dump); std::sort(foreground_positions.begin(), foreground_positions.end(), ReadPosSeqnameCmp()); Logger::push_task(task_name, foreground_positions.size()); LoggerTask& task = Logger::get_task(task_name); typedef std::pair<float, float> WeightedGC; std::vector<WeightedGC> foreground_gc, background_gc; int seqlen = 0; SeqName curr_seqname; char* seq = NULL; twobitseq tbseq; twobitseq tbseqrc; rng_t rng; pos_t L = seqbias[0] ? seqbias[0]->getL() : 0; std::vector<ReadPos>::iterator i; for (i = foreground_positions.begin(); i != foreground_positions.end(); ++i) { if (i->seqname != curr_seqname) { free(seq); seq = faidx_fetch_seq(ref_file, i->seqname.get().c_str(), 0, INT_MAX, &seqlen); Logger::debug("read sequence %s.", i->seqname.get().c_str()); if (seq == NULL) { Logger::warn("warning: reference sequence not found, skipping."); } else { for (char* c = seq; *c; c++) *c = tolower(*c); tbseq = seq; tbseqrc = tbseq; tbseqrc.revcomp(); } curr_seqname = i->seqname; } if (seq == NULL || (pos_t) tbseq.size() < median_frag_len) continue; // fragments with many copies tend to have too much weight when training // leading to somewhat less than stable results. if (i->count > 4) continue; // sample background position boost::random::uniform_int_distribution<pos_t> random_uniform( i->start + L, i->end - median_frag_len); pos_t pos = random_uniform(rng); float gc = (float) gc_count(seq + pos, median_frag_len) / median_frag_len; float sb = seqbias[0] ? 
seqbias[0]->get_bias(tbseq, pos - L) * seqbias[1]->get_bias(tbseqrc, seqlen - pos - 1 - L) : 1.0; background_gc.push_back(WeightedGC(gc, 1.0 / sb)); // sample foreground position if (i->strand == 0) { if (i->pos >= i->start && i->pos + median_frag_len - 1 <= i->end) { float sb = seqbias[0] ? seqbias[0]->get_bias(tbseq, i->pos - L) * seqbias[1]->get_bias(tbseqrc, seqlen - i->pos - 1 - L) : 1.0; foreground_gc.push_back( WeightedGC((float) gc_count(seq + i->pos, median_frag_len) / median_frag_len, 1.0 / sb)); } } else { if (i->pos - median_frag_len >= i->start && i->pos <= i->end) { float sb = seqbias[0] ? seqbias[0]->get_bias(tbseq, i->pos - median_frag_len - L) * seqbias[1]->get_bias(tbseqrc, seqlen - i->pos - median_frag_len - 1 - L) : 1.0; foreground_gc.push_back( WeightedGC((float) gc_count(seq + i->pos - median_frag_len, median_frag_len) / median_frag_len, 1.0 /sb)); } } task.inc(); } free(seq); fai_destroy(ref_file); #if 0 FILE* out = fopen("gcbias.tsv", "w"); fprintf(out, "group\tgc\tweight\n"); BOOST_FOREACH (WeightedGC& value, foreground_gc) { fprintf(out, "foreground\t%f\t%f\n", (double) value.first, (double) value.second); }
/**
 * Fuzzy right align alleles allowing for mismatches and indels defined by penalty.
 *
 * The alleles are first right aligned exactly. If penalty is non-zero, three
 * continuations are then tried with penalty-1: stepping over one base
 * (substitution), dropping the first base of ref (deletion) and dropping the
 * first base of alt (insertion). The right most position reached is returned.
 *
 * @chrom   - chromosome
 * @pos1    - 1 based position
 * @ref     - reference sequence
 * @alt     - alternative sequence
 * @penalty - mismatch/indels allowed
 *
 * Returns right aligned position.
 * Exits the program if a reference base cannot be read.
 */
uint32_t CandidateRegionExtractor::fuzzy_right_align(const char* chrom, int32_t pos1, std::string ref, std::string alt, uint32_t penalty)
{
    if (ref==alt)
    {
        return pos1;
    }

    int32_t seq_len = 0;
    char* seq = NULL;

    //exact right alignment
    while (ref.at(0)==alt.at(0))
    {
        seq = faidx_fetch_seq(fai, chrom, pos1, pos1, &seq_len);

        //a failed fetch returns NULL together with a negative length,
        //so testing seq_len for non-zero alone is not sufficient
        if (seq==NULL || seq_len<=0)
        {
            free(seq);
            fprintf(stderr, "[%s:%d %s] Cannot read from sequence file\n", __FILE__,__LINE__,__FUNCTION__);
            exit(1);
        }

        const char next_base = seq[0];
        free(seq);

        ref.erase(0,1);
        alt.erase(0,1);
        ref.push_back(next_base);
        alt.push_back(next_base);
        ++pos1;
    }

    if (penalty)
    {
        uint32_t pos1_sub = pos1;
        uint32_t pos1_del = pos1;
        uint32_t pos1_ins = pos1;

        //substitution
        seq = faidx_fetch_seq(fai, chrom, pos1, pos1, &seq_len);
        if (seq==NULL || seq_len<=0)
        {
            free(seq);
            fprintf(stderr, "[%s:%d %s] Cannot read from sequence file\n", __FILE__,__LINE__,__FUNCTION__);
            exit(1);
        }

        //copy the base and release the buffer before recursing;
        //this buffer used to be leaked on every call
        const char next_base = seq[0];
        free(seq);

        {
            std::string new_ref = ref;
            std::string new_alt = alt;
            new_ref.erase(0,1);
            new_alt.erase(0,1);
            new_ref.push_back(next_base);
            new_alt.push_back(next_base);
            pos1_sub = fuzzy_right_align(chrom, pos1+1, new_ref, new_alt, penalty-1);
        }

        //deletion
        if (ref.size()>1)
        {
            std::string new_ref = ref;
            new_ref.erase(0,1);
            pos1_del = fuzzy_right_align(chrom, pos1, new_ref, alt, penalty-1);
        }

        //insertion
        if (alt.size()>1)
        {
            std::string new_alt = alt;
            new_alt.erase(0,1);
            pos1_ins = fuzzy_right_align(chrom, pos1, ref, new_alt, penalty-1);
        }

        pos1 = std::max(pos1_sub, std::max(pos1_del, pos1_ins));
    }

    return pos1;
}
/** * Extract reference sequence region for motif discovery. * * The input is a VCF record that contains an indel. * * If the the indel has multiple alleles, it will examine all * alleles. * * todo: is might be a good idea to combine this step with motif detection * since there seems to be a need to have an iterative process here * to ensure a good candidate motif is chosen. * */ void CandidateRegionExtractor::extract_regions_by_exact_alignment(bcf_hdr_t* h, bcf1_t* v, Variant& variant) { if (debug) { if (debug) std::cerr << "********************************************\n"; std::cerr << "EXTRACTIING REGION BY EXACT LEFT AND RIGHT ALIGNMENT\n\n"; } VNTR& vntr = variant.vntr; const char* chrom = bcf_get_chrom(h, v); int32_t min_beg1 = bcf_get_pos1(v); int32_t max_end1 = min_beg1; if (debug) { bcf_print_liten(h, v); } //merge candidate search region for (size_t i=1; i<bcf_get_n_allele(v); ++i) { std::string ref(bcf_get_alt(v, 0)); std::string alt(bcf_get_alt(v, i)); int32_t pos1 = bcf_get_pos1(v); //this prevents introduction of flanks that do not harbour the repeat unit trim(pos1, ref, alt); int32_t end1 = pos1 + ref.size() - 1; right_align(chrom, end1, ref, alt); int32_t beg1 = end1 - ref.size() + 1; left_align(chrom, beg1, ref, alt); min_beg1 = beg1<min_beg1 ? beg1 : min_beg1; max_end1 = end1>max_end1 ? 
end1 : max_end1; int32_t seq_len; char* seq = faidx_fetch_seq(fai, chrom, min_beg1-1, max_end1-1, &seq_len); if (debug) { std::cerr << "EXACT REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") from " << pos1 << ":" << ref << ":" << alt << "\n"; std::cerr << " " << seq << "\n"; } if (seq_len) free(seq); } int32_t seq_len; char* seq = faidx_fetch_seq(fai, chrom, min_beg1-1, max_end1-1, &seq_len); if (debug) { std::cerr << "FINAL EXACT REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") " << "\n"; std::cerr << " " << seq << "\n"; } vntr.exact_repeat_tract = seq; vntr.rid = bcf_get_rid(v); vntr.exact_rbeg1 = min_beg1; vntr.exact_rend1 = max_end1; if (seq_len) free(seq); }
/*
 * Performs pileup
 *
 * Stages, in the order they appear in the body:
 *   1. Allocate per-input bookkeeping and the sample table; exit(1) if n == 0.
 *   2. For each input: open it with sam_open(), attach conf->fai_fname, read
 *      the header, register samples and read groups and, when conf->reg is
 *      set, load the index and create a region iterator. The header of the
 *      first input is kept in h; the others are destroyed.
 *   3. With MPLP_BCF: open the VCF/BCF output (conf->output_fname or "-"),
 *      build and write the header (version, command line, reference, contigs,
 *      INFO/FORMAT definitions, samples) and allocate the caller state.
 *      Otherwise open the text pileup output (conf->output_fname or stdout).
 *   4. When a region and conf->fai are both set, pre-fetch the reference of
 *      the region's target.
 *   5. Iterate positions with bam_mplp_auto(). Positions outside the region
 *      or the BED intervals are skipped. The reference is re-fetched whenever
 *      tid changes. Per position either BCF records are written (site record
 *      plus, unless MPLP_NO_INDEL, an indel record) or a text pileup line is
 *      written in which bases, base qualities and mapping qualities are
 *      limited to reads whose base quality is at least conf->min_baseQ.
 *   6. Release everything that was allocated.
 *
 * @param conf configuration for this pileup
 * @param n number of files specified in fn
 * @param fn filenames
 * @return the value of the last bam_mplp_auto() call, i.e. the non-positive
 *         value that ended the loop; setup failures call exit(1)
 *
 * NOTE(review): the MPLP_PRINT_POS column lists every read, while the other
 * per-read columns skip reads below conf->min_baseQ, so the columns can have
 * different numbers of entries - confirm whether that is intended.
 */
static int mpileup(mplp_conf_t *conf, int n, char **fn) { extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list); extern void bcf_call_del_rghash(void *rghash); mplp_aux_t **data; int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid = -1, max_depth, max_indel_depth; const bam_pileup1_t **plp; bam_mplp_t iter; bam_hdr_t *h = NULL; /* header of first file in input list */ char *ref; void *rghash = NULL; FILE *pileup_fp = NULL; bcf_callaux_t *bca = NULL; bcf_callret1_t *bcr = NULL; bcf_call_t bc; htsFile *bcf_fp = NULL; bcf_hdr_t *bcf_hdr = NULL; bam_sample_t *sm = NULL; kstring_t buf; mplp_pileup_t gplp; memset(&gplp, 0, sizeof(mplp_pileup_t)); memset(&buf, 0, sizeof(kstring_t)); memset(&bc, 0, sizeof(bcf_call_t)); data = calloc(n, sizeof(mplp_aux_t*)); plp = calloc(n, sizeof(bam_pileup1_t*)); n_plp = calloc(n, sizeof(int)); sm = bam_smpl_init(); if (n == 0) { fprintf(stderr,"[%s] no input file/data given\n", __func__); exit(1); } // read the header of each file in the list and initialize data for (i = 0; i < n; ++i) { bam_hdr_t *h_tmp; data[i] = calloc(1, sizeof(mplp_aux_t)); data[i]->fp = sam_open(fn[i], "rb"); if ( !data[i]->fp ) { fprintf(stderr, "[%s] failed to open %s: %s\n", __func__, fn[i], strerror(errno)); exit(1); } hts_set_fai_filename(data[i]->fp, conf->fai_fname); data[i]->conf = conf; h_tmp = sam_hdr_read(data[i]->fp); if ( !h_tmp ) { fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, fn[i]); exit(1); } data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 
0 : h_tmp->text); // Collect read group IDs with PL (platform) listed in pl_list (note: fragile, strstr search) rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list); if (conf->reg) { hts_idx_t *idx = sam_index_load(data[i]->fp, fn[i]); if (idx == 0) { fprintf(stderr, "[%s] fail to load index for %s\n", __func__, fn[i]); exit(1); } if ( (data[i]->iter=sam_itr_querys(idx, data[i]->h, conf->reg)) == 0) { fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, conf->reg); exit(1); } if (i == 0) tid0 = data[i]->iter->tid, beg0 = data[i]->iter->beg, end0 = data[i]->iter->end; hts_idx_destroy(idx); } if (i == 0) h = h_tmp; /* save the header of first file in list */ else { // FIXME: to check consistency bam_hdr_destroy(h_tmp); } } // allocate data storage proportionate to number of samples being studied sm->n gplp.n = sm->n; gplp.n_plp = calloc(sm->n, sizeof(int)); gplp.m_plp = calloc(sm->n, sizeof(int)); gplp.plp = calloc(sm->n, sizeof(bam_pileup1_t*)); fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n); // write the VCF header if (conf->flag & MPLP_BCF) { const char *mode; if ( conf->flag & MPLP_VCF ) mode = (conf->flag&MPLP_NO_COMP)? "wu" : "wz"; // uncompressed VCF or compressed VCF else mode = (conf->flag&MPLP_NO_COMP)? "wub" : "wb"; // uncompressed BCF or compressed BCF bcf_fp = bcf_open(conf->output_fname? conf->output_fname : "-", mode); if (bcf_fp == NULL) { fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? 
conf->output_fname : "standard output", strerror(errno)); exit(1); } bcf_hdr = bcf_hdr_init("w"); kstring_t str = {0,0,0}; ksprintf(&str, "##samtoolsVersion=%s+htslib-%s\n",samtools_version(),hts_version()); bcf_hdr_append(bcf_hdr, str.s); str.l = 0; ksprintf(&str, "##samtoolsCommand=samtools mpileup"); for (i=1; i<conf->argc; i++) ksprintf(&str, " %s", conf->argv[i]); kputc('\n', &str); bcf_hdr_append(bcf_hdr, str.s); if (conf->fai_fname) { str.l = 0; ksprintf(&str, "##reference=file://%s\n", conf->fai_fname); bcf_hdr_append(bcf_hdr, str.s); } // todo: use/write new BAM header manipulation routines, fill also UR, M5 for (i=0; i<h->n_targets; i++) { str.l = 0; ksprintf(&str, "##contig=<ID=%s,length=%d>", h->target_name[i], h->target_len[i]); bcf_hdr_append(bcf_hdr, str.s); } free(str.s); bcf_hdr_append(bcf_hdr,"##ALT=<ID=X,Description=\"Represents allele(s) other than observed.\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=IDV,Number=1,Type=Integer,Description=\"Maximum number of reads supporting an indel\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=IMF,Number=1,Type=Float,Description=\"Maximum fraction of reads supporting an indel\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)\",Version=\"3\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">"); 
bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">"); #if CDF_MWU_TESTS bcf_hdr_append(bcf_hdr,"##INFO=<ID=RPB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias [CDF] (bigger is better)\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias [CDF] (bigger is better)\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=BQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias [CDF] (bigger is better)\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQSB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias [CDF] (bigger is better)\">"); #endif bcf_hdr_append(bcf_hdr,"##INFO=<ID=SGB,Number=1,Type=Float,Description=\"Segregation based metric.\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQ0F,Number=1,Type=Float,Description=\"Fraction of MQ0 reads (smaller is better)\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=I16,Number=16,Type=Float,Description=\"Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=QS,Number=R,Type=Float,Description=\"Auxiliary tag used for calling\">"); bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">"); if ( conf->fmt_flag&B2B_FMT_DP ) bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Number of high-quality bases\">"); if ( conf->fmt_flag&B2B_FMT_DV ) bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"Number of high-quality non-reference bases\">"); if ( conf->fmt_flag&B2B_FMT_DPR ) bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">"); if ( conf->fmt_flag&B2B_INFO_DPR ) bcf_hdr_append(bcf_hdr,"##INFO=<ID=DPR,Number=R,Type=Integer,Description=\"Number of 
high-quality bases observed for each allele\">"); if ( conf->fmt_flag&B2B_FMT_DP4 ) bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-fwd, ref-reverse, alt-fwd and alt-reverse bases\">"); if ( conf->fmt_flag&B2B_FMT_SP ) bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">"); for (i=0; i<sm->n; i++) bcf_hdr_add_sample(bcf_hdr, sm->smpl[i]); bcf_hdr_add_sample(bcf_hdr, NULL); bcf_hdr_write(bcf_fp, bcf_hdr); bca = bcf_call_init(-1., conf->min_baseQ); bcr = calloc(sm->n, sizeof(bcf_callret1_t)); bca->rghash = rghash; bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ; bca->min_frac = conf->min_frac; bca->min_support = conf->min_support; bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE; bc.bcf_hdr = bcf_hdr; bc.n = sm->n; bc.PL = malloc(15 * sm->n * sizeof(*bc.PL)); if (conf->fmt_flag) { assert( sizeof(float)==sizeof(int32_t) ); bc.DP4 = malloc(sm->n * sizeof(int32_t) * 4); bc.fmt_arr = malloc(sm->n * sizeof(float)); // all fmt_flag fields if ( conf->fmt_flag&(B2B_INFO_DPR|B2B_FMT_DPR) ) { // first B2B_MAX_ALLELES fields for total numbers, the rest per-sample bc.DPR = malloc((sm->n+1)*B2B_MAX_ALLELES*sizeof(int32_t)); for (i=0; i<sm->n; i++) bcr[i].DPR = bc.DPR + (i+1)*B2B_MAX_ALLELES; } } } else { pileup_fp = conf->output_fname? 
fopen(conf->output_fname, "w") : stdout; if (pileup_fp == NULL) { fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname, strerror(errno)); exit(1); } } if (tid0 >= 0 && conf->fai) { // region is set ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len); ref_tid = tid0; for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0; } else ref_tid = -1, ref = 0; // begin pileup iter = bam_mplp_init(n, mplp_func, (void**)data); if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(iter); max_depth = conf->max_depth; if (max_depth * sm->n > 1<<20) fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__); if (max_depth * sm->n < 8000) { max_depth = 8000 / sm->n; fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth); } max_indel_depth = conf->max_indel_depth * sm->n; bam_mplp_set_maxcnt(iter, max_depth); bcf1_t *bcf_rec = bcf_init1(); int ret; while ( (ret=bam_mplp_auto(iter, &tid, &pos, n_plp, plp)) > 0) { if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; if (tid != ref_tid) { free(ref); ref = 0; if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len); for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid; ref_tid = tid; } if (conf->flag & MPLP_BCF) { int total_depth, _ref0, ref16; for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i]; group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG); _ref0 = (ref && pos < ref_len)? 
ref[pos] : 'N'; ref16 = seq_nt16_table[_ref0]; bcf_callaux_clean(bca, &bc); for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i); bc.tid = tid; bc.pos = pos; bcf_call_combine(gplp.n, bcr, bca, ref16, &bc); bcf_clear1(bcf_rec); bcf_call2bcf(&bc, bcf_rec, bcr, conf->fmt_flag, 0, 0); bcf_write1(bcf_fp, bcf_hdr, bcf_rec); // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring? if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) { bcf_callaux_clean(bca, &bc); for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i); if (bcf_call_combine(gplp.n, bcr, bca, -1, &bc) >= 0) { bcf_clear1(bcf_rec); bcf_call2bcf(&bc, bcf_rec, bcr, conf->fmt_flag, bca, ref); bcf_write1(bcf_fp, bcf_hdr, bcf_rec); } } } else { fprintf(pileup_fp, "%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N'); for (i = 0; i < n; ++i) { int j, cnt; for (j = cnt = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; if (bam_get_qual(p->b)[p->qpos] >= conf->min_baseQ) ++cnt; } fprintf(pileup_fp, "\t%d\t", cnt); if (n_plp[i] == 0) { fputs("*\t*", pileup_fp); if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", pileup_fp); if (conf->flag & MPLP_PRINT_POS) fputs("\t*", pileup_fp); } else { for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; if (bam_get_qual(p->b)[p->qpos] >= conf->min_baseQ) pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref); } putc('\t', pileup_fp); for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; int c = bam_get_qual(p->b)[p->qpos]; if (c >= conf->min_baseQ) { c = c + 33 < 126? 
c + 33 : 126; putc(c, pileup_fp); } } if (conf->flag & MPLP_PRINT_MAPQ) { putc('\t', pileup_fp); for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; int c = bam_get_qual(p->b)[p->qpos]; if ( c < conf->min_baseQ ) continue; c = plp[i][j].b->core.qual + 33; if (c > 126) c = 126; putc(c, pileup_fp); } } if (conf->flag & MPLP_PRINT_POS) { putc('\t', pileup_fp); for (j = 0; j < n_plp[i]; ++j) { if (j > 0) putc(',', pileup_fp); fprintf(pileup_fp, "%d", plp[i][j].qpos + 1); // FIXME: printf() is very slow... } } } } putc('\n', pileup_fp); } } // clean up free(bc.tmp.s); bcf_destroy1(bcf_rec); if (bcf_fp) { hts_close(bcf_fp); bcf_hdr_destroy(bcf_hdr); bcf_call_destroy(bca); free(bc.PL); free(bc.DP4); free(bc.DPR); free(bc.fmt_arr); free(bcr); } if (pileup_fp && conf->output_fname) fclose(pileup_fp); bam_smpl_destroy(sm); free(buf.s); for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]); free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp); bcf_call_del_rghash(rghash); bam_mplp_destroy(iter); bam_hdr_destroy(h); for (i = 0; i < n; ++i) { sam_close(data[i]->fp); if (data[i]->iter) hts_itr_destroy(data[i]->iter); free(data[i]); } free(data); free(plp); free(ref); free(n_plp); return ret; }
/** * Detect repeat region. */ void FlankDetector::detect_flanks(bcf_hdr_t* h, bcf1_t *v, Variant& variant, uint32_t mode) { VNTR& vntr = variant.vntr; //simple single base pair clipping of ends if (mode==CLIP_ENDS) { if (debug) { std::cerr << "********************************************\n"; std::cerr << "CLIP ENDS\n"; std:: cerr << "\n"; } //in the case of simple indels, it is guaranteed that repeat tract length //is always greater than 2 after exact alignment for deletions. This is //because at least one end of the REF and ALT will end or start in the same base, //and this allows the exact alignment detection of the repeat region to always //increase by at least 1 base. // //exceptions: // // simple insertion // C[G]C => [GC] // GA // // complex substitution // C[G]C => [G] // TA if (vntr.exact_repeat_tract.size()>2) { vntr.exact_repeat_tract = vntr.exact_repeat_tract.substr(1, vntr.exact_repeat_tract.size()-2); ++vntr.exact_rbeg1; } vntr.ru = choose_repeat_unit(vntr.exact_repeat_tract, vntr.motif); // std::cerr << "repeat tract : " vntr.exact_rend1 = vntr.exact_rbeg1+vntr.exact_rl-1; ahmm->set_model(vntr.ru.c_str()); ahmm->align(vntr.exact_repeat_tract.c_str(), qual.c_str()); vntr.exact_motif_concordance = ahmm->motif_concordance; vntr.exact_no_exact_ru = ahmm->exact_motif_count; vntr.exact_total_no_ru = ahmm->motif_count; vntr.exact_rl = ahmm->repeat_tract_len; if (debug) { vntr.print(); } } else if (mode==FRAHMM) { if (debug) { std::cerr << "********************************************\n"; std::cerr << "DETECTING REPEAT TRACT FUZZILY\n"; } ///////////////// //exact alignment ///////////////// if (debug) { std::cerr << "++++++++++++++++++++++++++++++++++++++++++++\n"; std::cerr << "Exact left/right alignment\n"; } if (vntr.exact_repeat_tract.size()>2) { //removing the anchor bases if (vntr.mlen==1) { int32_t offset = 0; int32_t length = vntr.exact_repeat_tract.size(); if (vntr.exact_repeat_tract.at(0)!=vntr.motif.at(0)) { offset = 1; ++vntr.exact_rbeg1; } if 
(vntr.exact_repeat_tract.at(vntr.exact_repeat_tract.size()-1)!=vntr.motif.at(0)) { length -= offset+1; --vntr.exact_rend1; } vntr.exact_repeat_tract = vntr.exact_repeat_tract.substr(offset, length); } else { if (vntr.exact_repeat_tract.size()>=3) { vntr.exact_repeat_tract = vntr.exact_repeat_tract.substr(1, vntr.exact_repeat_tract.size()-2); ++vntr.exact_rbeg1; --vntr.exact_rend1; } } } //this is for nonexistent repeat units // // RU : T // repeat_tract : G[T]C where T is an insert else if (vntr.exact_repeat_tract.size()==2) { } vntr.ru = choose_repeat_unit(vntr.exact_repeat_tract, vntr.motif); ahmm->set_model(vntr.ru.c_str()); ahmm->align(vntr.exact_repeat_tract.c_str(), qual.c_str()); vntr.exact_motif_concordance = ahmm->motif_concordance; vntr.exact_no_exact_ru = ahmm->exact_motif_count; vntr.exact_total_no_ru = ahmm->motif_count; vntr.exact_rl = ahmm->repeat_tract_len; if (debug) { std::cerr << "\n"; std::cerr << "repeat_tract : " << vntr.exact_repeat_tract << "\n"; std::cerr << "position : [" << vntr.exact_rbeg1 << "," << vntr.exact_rend1 << "]\n"; std::cerr << "motif_concordance : " << vntr.exact_motif_concordance << "\n"; std::cerr << "repeat units : " << vntr.exact_rl << "\n"; std::cerr << "exact repeat units : " << vntr.exact_no_exact_ru << "\n"; std::cerr << "total no. 
of repeat units : " << vntr.exact_total_no_ru << "\n"; std::cerr << "\n"; } /////////////////////// //fuzzy right alignment /////////////////////// if (debug) { std::cerr << "++++++++++++++++++++++++++++++++++++++++++++\n"; std::cerr << "Fuzzy right alignment\n"; } int32_t slen = 100; char* rflank; int32_t rflank_len; char* lflank; int32_t lflank_len; int32_t lflank_end1; int32_t rflank_beg1; char* seq; int32_t seq_len; // bool encountered_N = false; while (true) { //pick 5 bases to the right rflank = faidx_fetch_seq(fai, variant.chrom.c_str(), vntr.exact_rend1+1-1, vntr.exact_rend1+5-1, &rflank_len); //pick 105 bases for aligning seq = faidx_fetch_seq(fai, variant.chrom.c_str(), vntr.exact_rend1-slen-1, vntr.exact_rend1+5-1, &seq_len); rfhmm->set_model(vntr.ru.c_str(), rflank); rfhmm->align(seq, qual.c_str()); if (debug) rfhmm->print_alignment(); if (rflank_len) free(rflank); if (seq_len) free(seq); ////////////////////// //fuzzy left alignment ////////////////////// if (debug) { std::cerr << "\n"; std::cerr << "++++++++++++++++++++++++++++++++++++++++++++\n"; std::cerr << "Fuzzy left alignment\n"; } //this is a hack around rfhmm rigidity in modeling the RUs //todo: we should change this to a reverse version of LFHMM!!!! 
if (rfhmm->get_lflank_read_epos1()>2*vntr.ru.size()) { lflank_end1 = vntr.exact_rend1-slen-1+1 + rfhmm->get_lflank_read_epos1() - 1; break; } else if (slen==1000) { lflank_end1 = vntr.exact_rend1 - 1000 - 1; vntr.is_large_repeat_tract = true; break; } else { slen +=100; if (debug) std::cerr << "extending the reference sequence for RFHMM : " << slen << "\n"; } } slen = 100; //pick 5 bases to right while(true) { lflank = faidx_fetch_seq(fai, variant.chrom.c_str(), lflank_end1-5-1, lflank_end1-1, &lflank_len); //pick 105 bases for aligning seq = faidx_fetch_seq(fai, variant.chrom.c_str(), lflank_end1-5-1, lflank_end1+slen-1-1, &seq_len); lfhmm->set_model(lflank, vntr.ru.c_str()); lfhmm->align(seq, qual.c_str()); if (debug) lfhmm->print_alignment(); if (seq_len) free(seq); if (lfhmm->get_rflank_read_epos1()!=INT32_MAX) { rflank_beg1 = lflank_end1 - 5 + lfhmm->get_rflank_read_spos1() -1; break; } else if (slen==1000) { rflank_beg1 = lflank_end1 + 1000; vntr.is_large_repeat_tract = true; break; } else { slen +=100; if (debug) std::cerr << "extending the reference sequence for LFHMM : " << slen << "\n"; } } if (lflank_len) free(lflank); lflank = faidx_fetch_seq(fai, variant.chrom.c_str(), lflank_end1-10-1, lflank_end1-1, &lflank_len); rflank = faidx_fetch_seq(fai, variant.chrom.c_str(), rflank_beg1-1, rflank_beg1 +10 -1 -1, &rflank_len); vntr.fuzzy_rbeg1 = lflank_end1+1; vntr.fuzzy_rend1 = rflank_beg1-1; int32_t repeat_tract_len; char* repeat_tract = faidx_fetch_seq(fai, variant.chrom.c_str(), lflank_end1, rflank_beg1-1-1, &repeat_tract_len); vntr.fuzzy_repeat_tract.assign(repeat_tract); if (repeat_tract_len) free(repeat_tract); vntr.fuzzy_motif_concordance = lfhmm->motif_concordance; vntr.fuzzy_no_exact_ru = lfhmm->exact_motif_count; vntr.fuzzy_total_no_ru = lfhmm->motif_count; vntr.fuzzy_rl = rflank_beg1-lflank_end1-1; if (lflank_len) free(lflank); if (rflank_len) free(rflank); if (debug) { std::cerr << "\n"; vntr.print(); std::cerr << "\n"; } } //fill in flanks const 
char* chrom = variant.chrom.c_str(); uint32_t pos1 = vntr.exact_rbeg1; int32_t len = 0; faidx_fetch_seq(fai, chrom, pos1-10, pos1-1, &len); };
/* Capacity of the per-column scratch buffers used by the MDW summary. */
enum { MPLP_MDW_BUF_CAP = 10000, MPLP_MDW_MAX_RUN = 15 };

/* Reference base at `pos`, or 'N' when no reference is loaded or `pos` lies
 * outside [0, ref_len). */
static char mplp_ref_base(const char *ref, int ref_len, int pos)
{
    if (ref == 0 || pos < 0 || pos >= ref_len) return 'N';
    return ref[pos];
}

/* Number of consecutive reference bases (case-insensitive, at most
 * MPLP_MDW_MAX_RUN) equal to the base at `anchor`, walking from `anchor` in
 * direction `step` (+1 or -1). Returns 0 when `anchor` is not a valid
 * reference position. */
static int mplp_same_base_run(const char *ref, int ref_len, int anchor, int step)
{
    int k, run = 0, base;
    if (ref == 0 || anchor < 0 || anchor >= ref_len) return 0;
    base = toupper((unsigned char)ref[anchor]);
    for (k = 0; k < MPLP_MDW_MAX_RUN; ++k) {
        int q = anchor + step * k;
        if (q < 0 || q >= ref_len) break;
        if (toupper((unsigned char)ref[q]) != base) break; // chain ends at the first different base
        ++run;
    }
    return run;
}

/* Append `c` to buf[0..cap); silently drops the character once the buffer is
 * full so that a very deep column cannot write past the end. */
static void mplp_buf_putc(char *buf, int *len, int cap, char c)
{
    if (*len < cap) buf[(*len)++] = c;
}

/*
 * Run a multi-sample pileup over the `n` BAM files named in `fn`.
 *
 * With MPLP_GLF set in conf->flag, genotype likelihoods (and optionally indel
 * calls) are written as BCF to stdout. Otherwise one text line per position is
 * printed to stdout; for every input with coverage it carries the MDW summary:
 * per-strand base counts, per-strand deleted / inserted sequences
 * (comma-terminated), the number of reads counted and struck, and the median
 * absolute deviation of the distance of non-reference observations to the
 * nearer aligned read end.
 *
 * Exits the process when an index cannot be loaded or the region cannot be
 * parsed. Returns 0.
 */
static int mpileup(mplp_conf_t *conf, int n, char **fn)
{
    extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list);
    extern void bcf_call_del_rghash(void *rghash);
    mplp_aux_t **data;
    int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid = -1, max_depth, max_indel_depth;
    const bam_pileup1_t **plp;
    bam_mplp_t iter;
    bam_header_t *h = 0;
    char *ref;
    void *rghash = 0;

    bcf_callaux_t *bca = 0;
    bcf_callret1_t *bcr = 0;
    bcf_call_t bc;
    bcf_t *bp = 0;
    bcf_hdr_t *bh = 0;

    bam_sample_t *sm = 0;
    kstring_t buf;
    mplp_pileup_t gplp;

    memset(&gplp, 0, sizeof(mplp_pileup_t));
    memset(&buf, 0, sizeof(kstring_t));
    memset(&bc, 0, sizeof(bcf_call_t));
    data = calloc(n, sizeof(void*));
    plp = calloc(n, sizeof(void*));
    n_plp = calloc(n, sizeof(int)); // elements are int, not int*
    sm = bam_smpl_init();

    // read the header and initialize data
    for (i = 0; i < n; ++i) {
        bam_header_t *h_tmp;
        data[i] = calloc(1, sizeof(mplp_aux_t));
        data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r");
        data[i]->conf = conf;
        h_tmp = bam_header_read(data[i]->fp);
        data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet
        bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text);
        rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list);
        if (conf->reg) {
            int beg, end;
            bam_index_t *idx;
            idx = bam_index_load(fn[i]);
            if (idx == 0) {
                fprintf(stderr, "[%s] fail to load index for %d-th input.\n", __func__, i+1);
                exit(1);
            }
            if (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) {
                fprintf(stderr, "[%s] malformatted region or wrong seqname for %d-th input.\n", __func__, i+1);
                exit(1);
            }
            if (i == 0) tid0 = tid, beg0 = beg, end0 = end;
            data[i]->iter = bam_iter_query(idx, tid, beg, end);
            bam_index_destroy(idx);
        }
        if (i == 0) h = h_tmp;
        else {
            // FIXME: to check consistency
            bam_header_destroy(h_tmp);
        }
    }
    gplp.n = sm->n;
    gplp.n_plp = calloc(sm->n, sizeof(int));
    gplp.m_plp = calloc(sm->n, sizeof(int));
    gplp.plp = calloc(sm->n, sizeof(void*));

    fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n);
    // write the VCF header
    if (conf->flag & MPLP_GLF) {
        kstring_t s;
        bh = calloc(1, sizeof(bcf_hdr_t));
        s.l = s.m = 0; s.s = 0;
        bp = bcf_open("-", (conf->flag&MPLP_NO_COMP)? "wu" : "w");
        for (i = 0; i < h->n_targets; ++i) {
            kputs(h->target_name[i], &s);
            kputc('\0', &s);
        }
        bh->l_nm = s.l;
        bh->name = malloc(s.l);
        memcpy(bh->name, s.s, s.l);
        s.l = 0;
        for (i = 0; i < sm->n; ++i) {
            kputs(sm->smpl[i], &s);
            kputc('\0', &s);
        }
        bh->l_smpl = s.l;
        bh->sname = malloc(s.l);
        memcpy(bh->sname, s.s, s.l);
        bh->txt = malloc(strlen(BAM_VERSION) + 64);
        bh->l_txt = 1 + sprintf(bh->txt, "##samtoolsVersion=%s\n", BAM_VERSION);
        free(s.s);
        bcf_hdr_sync(bh);
        bcf_hdr_write(bp, bh);
        bca = bcf_call_init(-1., conf->min_baseQ);
        bcr = calloc(sm->n, sizeof(bcf_callret1_t));
        bca->rghash = rghash;
        bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ;
        bca->min_frac = conf->min_frac;
        bca->min_support = conf->min_support;
    }
    if (tid0 >= 0 && conf->fai) { // region is set
        ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len);
        ref_tid = tid0;
        for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0;
    } else ref_tid = -1, ref = 0;
    iter = bam_mplp_init(n, mplp_func, (void**)data);
    max_depth = conf->max_depth;
    if (max_depth * sm->n > 1<<20)
        fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__);
    if (max_depth * sm->n < 8000) {
        max_depth = 8000 / sm->n;
        fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth);
    }
    max_indel_depth = conf->max_indel_depth * sm->n;
    bam_mplp_set_maxcnt(iter, max_depth);

    while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) {
        if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
        if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
        if (tid != ref_tid) {
            free(ref); ref = 0;
            if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len);
            for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid;
            ref_tid = tid;
        }
        if (conf->flag & MPLP_GLF) {
            int total_depth, _ref0, ref16;
            bcf1_t *b = calloc(1, sizeof(bcf1_t));
            for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i];
            group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG);
            _ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
            ref16 = bam_nt16_table[_ref0];
            for (i = 0; i < gplp.n; ++i)
                bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i);
            bcf_call_combine(gplp.n, bcr, ref16, &bc);
            bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
                         (conf->flag&MPLP_FMT_SP), 0, 0);
            bcf_write(bp, bh, b);
            bcf_destroy(b);
            // call indels
            if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth
                && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0)
            {
                for (i = 0; i < gplp.n; ++i)
                    bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i);
                if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) {
                    b = calloc(1, sizeof(bcf1_t));
                    bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
                                 (conf->flag&MPLP_FMT_SP), bca, ref);
                    bcf_write(bp, bh, b);
                    bcf_destroy(b);
                }
            }
        } else {
            printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N');
            for (i = 0; i < n; ++i) {
                int j;
                printf("\t%d\t", n_plp[i]);
                if (n_plp[i] == 0) {
                    printf("*\t*"); // FIXME: printf() is very slow...
                    if (conf->flag & MPLP_PRINT_POS) printf("\t*");
                } else {
                    //MDW start
                    //for each position in the pileup column
                    int countChars[16][2];  // indexed by 4-bit base code, then strand
                    int numStruck = 0;
                    int numGood = 0;
                    int tti;
                    char insStr0[MPLP_MDW_BUF_CAP]; int iCnt0 = 0;
                    char insStr1[MPLP_MDW_BUF_CAP]; int iCnt1 = 0;
                    char delStr0[MPLP_MDW_BUF_CAP]; int dCnt0 = 0;
                    char delStr1[MPLP_MDW_BUF_CAP]; int dCnt1 = 0;
                    float qposP[MPLP_MDW_BUF_CAP]; int qposCnt = 0;
                    int replC, reprC, repT;
                    float medAbsDev = -1;

                    //initialize with zeros
                    memset(countChars, 0, sizeof(countChars));

                    // define repeat length here: length of the run of identical
                    // bases (up to 15) starting one position to the left and one
                    // position to the right of the current position; not a
                    // strict homopolymer test. Bounds-checked so that columns
                    // near either end of the contig, or runs without a
                    // reference, do not read outside `ref`.
                    replC = mplp_same_base_run(ref, ref_len, pos - 1, -1);
                    reprC = mplp_same_base_run(ref, ref_len, pos + 1, +1);
                    repT = replC < reprC ? reprC : replC;

                    for (j = 0; j < n_plp[i]; ++j) {
                        const bam_pileup1_t *p = plp[i] + j;
                        int rev = bam1_strand(p->b);

                        /* SAME LOGIC AS pileup_seq() */
                        if (p->is_refskip) { // never count intron gaps in numStruck
                            continue;
                        }
                        if (p->is_del) { // skip deletion gap, after first position which is the first aligned char
                            continue;
                        }
                        if ( p->b->core.qual < conf->min_mqToCount || // mapping quality
                             conf->maxrepC < (repT) || // max homopolymer run
                             (!p->is_del && bam1_qual(p->b)[p->qpos] < conf->min_baseQ) || // base quality for matches
                             p->alignedQPosBeg <= (conf->trimEnd ) || p->alignedQPosEnd <= (conf->trimEnd ) || // trimEnd is 1-based
                             p->zf == 1 || // fusion tag
                             p->ih > conf->maxIH || // max hit index
                             (p->nmd > conf->maxNM) || // max mismatch
                             (conf->flagFilter == 1 && !(p->b->core.flag&BAM_FPROPER_PAIR)) || // optionally keep only proper pairs
                             (conf->flagFilter == 2 && p->b->core.flag&BAM_FSECONDARY) || // optionally strike secondary
                             (conf->flagFilter == 3 && p->b->core.flag&BAM_FDUP) || // optionally strike dup
                             (conf->flagFilter == 4 && (p->b->core.flag&BAM_FDUP || p->b->core.flag&BAM_FSECONDARY)) || // optionally strike secondary or dup
                             (conf->flagFilter == 5 && (p->b->core.flag&BAM_FDUP || p->b->core.flag&BAM_FSECONDARY || p->b->core.flag&BAM_FQCFAIL || !(p->b->core.flag&BAM_FPROPER_PAIR) )) // optionally strike secondary, dup and QCfail
                           ) {
                            numStruck++;
                            continue;
                        }

                        // deletions and reference skips were already skipped
                        // above, so only the indel state matters here
                        if (p->indel == 0) {
                            countChars[ bam1_seqi(bam1_seq(p->b), p->qpos) ][ rev ] ++;
                            numGood++;
                        } else if (p->indel < 0) {
                            // deleted reference bases start at the next position
                            char *dst = rev == 0 ? delStr0 : delStr1;
                            int *cnt = rev == 0 ? &dCnt0 : &dCnt1;
                            numGood++;
                            for (tti = 1; tti <= -p->indel; tti++)
                                mplp_buf_putc(dst, cnt, MPLP_MDW_BUF_CAP, mplp_ref_base(ref, ref_len, pos + tti));
                            mplp_buf_putc(dst, cnt, MPLP_MDW_BUF_CAP, ',');
                        } else {
                            // inserted read bases follow the current query position
                            char *dst = rev == 0 ? insStr0 : insStr1;
                            int *cnt = rev == 0 ? &iCnt0 : &iCnt1;
                            numGood++;
                            for (tti = 1; tti <= p->indel; tti++)
                                mplp_buf_putc(dst, cnt, MPLP_MDW_BUF_CAP,
                                              bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + tti)]);
                            mplp_buf_putc(dst, cnt, MPLP_MDW_BUF_CAP, ',');
                        }

                        //calculate position of variant within aligned read - no soft clips
                        if ( toupper((unsigned char)mplp_ref_base(ref, ref_len, pos))
                                 != toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)])
                             || p->indel != 0 ) {
                            //distance to end; calculate distance to end of aligned read. removes soft clips.
                            int distToEnd = (p->alignedQPosBeg < p->alignedQPosEnd) ? p->alignedQPosBeg : p->alignedQPosEnd;
                            if (qposCnt < MPLP_MDW_BUF_CAP) qposP[qposCnt++] = distToEnd;
                        }
                    }

                    //print A,C,G,T, by +/-
                    // NOTE(review): the fifth pair uses code 7, not 15 (N) - confirm intent.
                    printf("\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d",
                           countChars[1][0],countChars[1][1],
                           countChars[2][0],countChars[2][1],
                           countChars[4][0],countChars[4][1],
                           countChars[8][0],countChars[8][1],
                           countChars[7][0],countChars[7][1]);

                    putchar('\t');
                    for (tti = 0; tti < dCnt0; tti++) putchar(delStr0[tti]);
                    putchar('\t');
                    for (tti = 0; tti < dCnt1; tti++) putchar(delStr1[tti]);
                    putchar('\t');
                    for (tti = 0; tti < iCnt0; tti++) putchar(insStr0[tti]);
                    putchar('\t');
                    for (tti = 0; tti < iCnt1; tti++) putchar(insStr1[tti]);

                    printf("\t%d\t%d",numGood,numStruck);

                    // get non-ref qpos variation
                    if (qposCnt > 0) {
                        float medqpos = median(qposCnt, qposP);
                        float absDev[qposCnt];
                        for (tti = 0; tti < qposCnt; tti++) {
                            // computed by hand: abs() takes an int and would
                            // truncate a fractional deviation
                            float d = medqpos - qposP[tti];
                            absDev[tti] = d < 0 ? -d : d;
                        }
                        // absDev holds qposCnt values, same count as the first call
                        medAbsDev = median(qposCnt, absDev);
                    }
                    printf("\t%f",medAbsDev);
                    ///END MDW
                }
            }
            putchar('\n');
        }
    }

    bcf_close(bp);
    bam_smpl_destroy(sm); free(buf.s);
    for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]);
    free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp);
    bcf_call_del_rghash(rghash);
    bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr);
    bam_mplp_destroy(iter);
    bam_header_destroy(h);
    for (i = 0; i < n; ++i) {
        bam_close(data[i]->fp);
        if (data[i]->iter) bam_iter_destroy(data[i]->iter);
        free(data[i]);
    }
    free(data); free(plp); free(ref); free(n_plp);
    return 0;
}
/*
 * Walk the pileup of config->fp and emit per-position methylation calls.
 *
 * For every position that lies in the requested region (config->reg), overlaps
 * the BED file (config->bedName) when one is given, and sits in an enabled
 * CpG/CHG/CHH context, the methylated/unmethylated read counts are written to
 * config->output_fp[type]. With config->merge set, CpG and CHG calls are
 * passed through processLast() so that both strands are combined.
 *
 * Errors are reported on stderr and end the function early. Every exit path
 * goes through a single cleanup section, so the header, iterators, buffers
 * and the lastCall structs are released on failure as well as on success.
 */
void extractCalls(Config *config) {
    bam_hdr_t *hdr = sam_hdr_read(config->fp);
    bam_mplp_t iter = NULL;
    int ret, tid, pos, i, seqlen, type, rv, o = 0;
    int beg0 = 0, end0 = 1u<<29;
    int n_plp; //This will need to be modified for multiple input files
    int ctid = -1; //The tid of the contig whose sequence is stored in "seq"
    int idxBED = 0, strand;
    uint32_t nmethyl = 0, nunmethyl = 0;
    const bam_pileup1_t **plp = NULL;
    char *seq = NULL, base;
    mplp_data *data = NULL;
    struct lastCall *lastCpG = NULL;
    struct lastCall *lastCHG = NULL;

    if(hdr == NULL) {
        fprintf(stderr, "Couldn't read the header in extractCalls()!\n");
        goto cleanup;
    }

    if(config->merge) {
        //Checked explicitly rather than with assert(), which is compiled out under NDEBUG
        if(config->keepCpG) {
            lastCpG = calloc(1, sizeof(struct lastCall));
            if(lastCpG == NULL) {
                fprintf(stderr, "Couldn't allocate space for the lastCpG structure in extractCalls()!\n");
                goto cleanup;
            }
            lastCpG->tid = -1;
        }
        if(config->keepCHG) {
            lastCHG = calloc(1, sizeof(struct lastCall));
            if(lastCHG == NULL) {
                fprintf(stderr, "Couldn't allocate space for the lastCHG structure in extractCalls()!\n");
                goto cleanup;
            }
            lastCHG->tid = -1;
        }
    }

    data = calloc(1,sizeof(mplp_data));
    if(data == NULL) {
        fprintf(stderr, "Couldn't allocate space for the data structure in extractCalls()!\n");
        goto cleanup;
    }

    data->config = config;
    data->hdr = hdr;
    if (config->reg) {
        if((data->iter = sam_itr_querys(config->bai, hdr, config->reg)) == 0) {
            fprintf(stderr, "failed to parse regions %s", config->reg);
            goto cleanup;
        }
    }
    if(config->bedName) {
        config->bed = parseBED(config->bedName, hdr);
        if(config->bed == NULL) goto cleanup;
    }

    plp = calloc(1, sizeof(bam_pileup1_t *)); //This will have to be modified for multiple input files
    if(plp == NULL) {
        fprintf(stderr, "Couldn't allocate space for the plp structure in extractCalls()!\n");
        goto cleanup;
    }

    //Start the pileup
    iter = bam_mplp_init(1, filter_func, (void **) &data);
    bam_mplp_init_overlaps(iter);
    bam_mplp_set_maxcnt(iter, config->maxDepth);
    while((ret = cust_mplp_auto(iter, &tid, &pos, &n_plp, plp)) > 0) {
        //Do we need to process this position?
        if (config->reg) {
            beg0 = data->iter->beg, end0 = data->iter->end;
            if ((pos < beg0 || pos >= end0)) continue; // out of the region requested
        }
        if(tid != ctid) {
            //New contig: swap in its reference sequence
            if(seq != NULL) free(seq);
            seq = faidx_fetch_seq(config->fai, hdr->target_name[tid], 0, faidx_seq_len(config->fai, hdr->target_name[tid]), &seqlen);
            if(seqlen < 0) {
                fprintf(stderr, "faidx_fetch_seq returned %i while trying to fetch the sequence for tid %i (%s)!\n",\
                    seqlen, tid, hdr->target_name[tid]);
                fprintf(stderr, "Note that the output will be truncated!\n");
                goto cleanup;
            }
            ctid = tid;
        }

        if(config->bed) { //Handle -l
            while((o = posOverlapsBED(tid, pos, config->bed, idxBED)) == -1) idxBED++;
            if(o == 0) continue; //Wrong strand
        }

        //type indexes config->output_fp: 0 = CpG, 1 = CHG, 2 = CHH
        if(isCpG(seq, pos, seqlen)) {
            if(!config->keepCpG) continue;
            type = 0;
        } else if(isCHG(seq, pos, seqlen)) {
            if(!config->keepCHG) continue;
            type = 1;
        } else if(isCHH(seq, pos, seqlen)) {
            if(!config->keepCHH) continue;
            type = 2;
        } else {
            continue;
        }

        nmethyl = nunmethyl = 0;
        base = *(seq+pos);
        for(i=0; i<n_plp; i++) {
            if(plp[0][i].is_del) continue;
            if(plp[0][i].is_refskip) continue;
            if(config->bed) if(!readStrandOverlapsBED(plp[0][i].b, config->bed->region[idxBED])) continue;
            strand = getStrand((plp[0]+i)->b);
            //Odd strand values are only counted over a reference C, even ones over a G
            if(strand & 1) {
                if(base != 'C' && base != 'c') continue;
            } else {
                if(base != 'G' && base != 'g') continue;
            }
            rv = updateMetrics(config, plp[0]+i);
            if(rv > 0) nmethyl++;
            else if(rv<0) nunmethyl++;
        }

        if(nmethyl+nunmethyl==0) continue;
        if(!config->merge || type==2) {
            writeCall(config->output_fp[type], config, hdr->target_name[tid], pos, 1, nmethyl, nunmethyl);
        } else {
            //Merge into per-CpG/CHG metrics
            if(type==0) {
                if(base=='G' || base=='g') pos--;
                processLast(config->output_fp[0], config, lastCpG, hdr, tid, pos, 2, nmethyl, nunmethyl);
            } else {
                if(base=='G' || base=='g') pos-=2;
                processLast(config->output_fp[1], config, lastCHG, hdr, tid, pos, 3, nmethyl, nunmethyl);
            }
        }
    }

    //Don't forget the last CpG/CHG
    if(config->merge) {
        if(config->keepCpG && lastCpG->tid != -1) {
            processLast(config->output_fp[0], config, lastCpG, hdr, tid, pos, 2, nmethyl, nunmethyl);
        }
        if(config->keepCHG && lastCHG->tid != -1) {
            processLast(config->output_fp[1], config, lastCHG, hdr, tid, pos, 3, nmethyl, nunmethyl);
        }
    }

cleanup:
    if(hdr) bam_hdr_destroy(hdr);
    if(data && data->iter) hts_itr_destroy(data->iter);
    if(iter) bam_mplp_destroy(iter);
    free(data);
    free(plp);
    free(seq);
    free(lastCpG);
    free(lastCHG);
}
int main_ld(int argc, char *argv[]) { int chr; //! chromosome identifier int beg; //! beginning coordinate for analysis int end; //! end coordinate for analysis int ref; //! ref long num_windows; //! number of windows std::string msg; //! string for error message bam_plbuf_t *buf; //! pileup buffer ldData t; // parse the command line options std::string region = t.parseCommandLine(argc, argv); // check input BAM file for errors t.checkBAM(); // initialize the sample data structure t.bam_smpl_init(); // add samples t.bam_smpl_add(); // initialize error model t.em = errmod_init(1.0-0.83); // parse genomic region int k = bam_parse_region(t.h, region, &chr, &beg, &end); if (k < 0) { msg = "Bad genome coordinates: " + region; fatal_error(msg, __FILE__, __LINE__, 0); } // fetch reference sequence t.ref_base = faidx_fetch_seq(t.fai_file, t.h->target_name[chr], 0, 0x7fffffff, &(t.len)); // calculate the number of windows if (t.flag & BAM_WINDOW) num_windows = ((end-beg)-1)/t.win_size; else { t.win_size = (end-beg); num_windows = 1; } // iterate through all windows along specified genomic region for (long cw=0; cw < num_windows; cw++) { // construct genome coordinate string std::string scaffold_name(t.h->target_name[chr]); std::ostringstream winc(scaffold_name); winc.seekp(0, std::ios::end); winc << ":" << beg+(cw*t.win_size)+1 << "-" << ((cw+1)*t.win_size)+(beg-1); std::string winCoord = winc.str(); // initialize number of sites to zero t.num_sites = 0; // parse the BAM file and check if region is retrieved from the reference if (t.flag & BAM_WINDOW) { k = bam_parse_region(t.h, winCoord, &ref, &(t.beg), &(t.end)); if (k < 0) { msg = "Bad window coordinates " + winCoord; fatal_error(msg, __FILE__, __LINE__, 0); } } else { ref = chr; t.beg = beg; t.end = end; if (ref < 0) { msg = "Bad scaffold name: " + region; fatal_error(msg, __FILE__, __LINE__, 0); } } // initialize nucdiv variables t.init_ld(); // create population assignments t.assign_pops(); // initialize pileup buf = 
bam_plbuf_init(make_ld, &t); // fetch region from bam file if ((bam_fetch(t.bam_in->x.bam, t.idx, ref, t.beg, t.end, buf, fetch_func)) < 0) { msg = "Failed to retrieve region " + region + " due to corrupted BAM index file"; fatal_error(msg, __FILE__, __LINE__, 0); } // finalize pileup bam_plbuf_push(0, buf); // calculate linkage disequilibrium statistics ld_func fp[3] = {&ldData::calc_zns, &ldData::calc_omegamax, &ldData::calc_wall}; (t.*fp[t.output])(); // print results to stdout t.print_ld(chr); // take out the garbage t.destroy_ld(); bam_plbuf_destroy(buf); } // end of window interation errmod_destroy(t.em); samclose(t.bam_in); bam_index_destroy(t.idx); t.bam_smpl_destroy(); free(t.ref_base); return 0; }
int main(int argc, char **argv) { htsFile *in = NULL; htsFile *out = NULL; char *in_name = "-"; char *out_name = "-"; char *ref_name = NULL; char *ref_seq = NULL; char modew[8] = "w"; faidx_t *fai = NULL; bam_hdr_t *hdr = NULL; bam1_t *rec = NULL; int c, res, last_ref = -1, ref_len = 0; int adjust = 0, extended = 0, recalc = 0, flags = 0; while ((c = getopt(argc, argv, "aef:hi:o:r")) >= 0) { switch (c) { case 'a': adjust = 1; break; case 'e': extended = 1; break; case 'f': ref_name = optarg; break; case 'h': usage(argv[0]); return EXIT_SUCCESS; case 'i': in_name = optarg; break; case 'o': out_name = optarg; break; case 'r': recalc = 1; break; default: usage(argv[0]); return EXIT_FAILURE; } } if (!ref_name) { usage(argv[0]); return EXIT_FAILURE; } flags = (adjust ? 1 : 0) | (extended ? 2 : 0) | (recalc ? 4 : 0); fai = fai_load(ref_name); if (!fai) { fprintf(stderr, "Couldn't load reference %s\n", ref_name); goto fail; } rec = bam_init1(); if (!rec) { perror(NULL); goto fail; } in = hts_open(in_name, "r"); if (!in) { fprintf(stderr, "Couldn't open %s : %s\n", in_name, strerror(errno)); goto fail; } hdr = sam_hdr_read(in); if (!hdr) { fprintf(stderr, "Couldn't read header for %s\n", in_name); goto fail; } out = hts_open(out_name, modew); if (!out) { fprintf(stderr, "Couldn't open %s : %s\n", out_name, strerror(errno)); goto fail; } if (sam_hdr_write(out, hdr) < 0) { fprintf(stderr, "Couldn't write header to %s : %s\n", out_name, strerror(errno)); goto fail; } while ((res = sam_read1(in, hdr, rec)) >= 0) { if (rec->core.tid >= hdr->n_targets) { fprintf(stderr, "Invalid BAM reference id %d\n", rec->core.tid); goto fail; } if (last_ref != rec->core.tid && rec->core.tid >= 0) { free(ref_seq); ref_seq = faidx_fetch_seq(fai, hdr->target_name[rec->core.tid], 0, INT_MAX, &ref_len); if (!ref_seq) { fprintf(stderr, "Couldn't get reference %s\n", hdr->target_name[rec->core.tid]); goto fail; } last_ref = rec->core.tid; } if (rec->core.tid >= 0) { res = sam_prob_realn(rec, 
ref_seq, ref_len, flags); if (res <= -4) { fprintf(stderr, "Error running sam_prob_realn : %s\n", strerror(errno)); goto fail; } } if (sam_write1(out, hdr, rec) < 0) { fprintf(stderr, "Error writing to %s\n", out_name); goto fail; } } res = hts_close(in); in = NULL; if (res < 0) { fprintf(stderr, "Error closing %s\n", in_name); goto fail; } res = hts_close(out); out = NULL; if (res < 0) { fprintf(stderr, "Error closing %s\n", out_name); goto fail; } bam_hdr_destroy(hdr); bam_destroy1(rec); free(ref_seq); fai_destroy(fai); return EXIT_SUCCESS; fail: if (hdr) bam_hdr_destroy(hdr); if (rec) bam_destroy1(rec); if (in) hts_close(in); if (out) hts_close(out); free(ref_seq); fai_destroy(fai); return EXIT_FAILURE; }