static void copy_check_alignment(const char *infname, const char *informat, const char *outfname, const char *outmode, const char *outref) { samFile *in = sam_open(infname, "r"); samFile *out = sam_open(outfname, outmode); bam1_t *aln = bam_init1(); bam_hdr_t *header = NULL; int res; if (!in) { fail("couldn't open %s", infname); goto err; } if (!out) { fail("couldn't open %s with mode %s", outfname, outmode); goto err; } if (!aln) { fail("bam_init1() failed"); goto err; } if (outref) { if (hts_set_opt(out, CRAM_OPT_REFERENCE, outref) < 0) { fail("setting reference %s for %s", outref, outfname); goto err; } } header = sam_hdr_read(in); if (!header) { fail("reading header from %s", infname); goto err; } if (sam_hdr_write(out, header) < 0) fail("writing headers to %s", outfname); while ((res = sam_read1(in, header, aln)) >= 0) { int mod4 = ((intptr_t) bam_get_cigar(aln)) % 4; if (mod4 != 0) fail("%s CIGAR not 4-byte aligned; offset is 4k+%d for \"%s\"", informat, mod4, bam_get_qname(aln)); if (sam_write1(out, header, aln) < 0) fail("writing to %s", outfname); } if (res < -1) { fail("failed to read alignment from %s", infname); } err: bam_destroy1(aln); bam_hdr_destroy(header); if (in) sam_close(in); if (out) sam_close(out); }
/* * This function calculates ct tag for two bams, it assumes they are from the same template and * writes the tag to the first read in position terms. */ static void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str) { bam1_t *swap; int i, end; uint32_t *cigar; str->l = 0; if (b1->core.tid != b2->core.tid || b1->core.tid < 0 || b1->core.pos < 0 || b2->core.pos < 0 || b1->core.flag&BAM_FUNMAP || b2->core.flag&BAM_FUNMAP) return; // coordinateless or not on the same chr; skip if (b1->core.pos > b2->core.pos) swap = b1, b1 = b2, b2 = swap; // make sure b1 has a smaller coordinate kputc((b1->core.flag & BAM_FREAD1)? '1' : '2', str); // segment index kputc((b1->core.flag & BAM_FREVERSE)? 'R' : 'F', str); // strand for (i = 0, cigar = bam_get_cigar(b1); i < b1->core.n_cigar; ++i) { kputw(bam_cigar_oplen(cigar[i]), str); kputc(bam_cigar_opchr(cigar[i]), str); } end = bam_endpos(b1); kputw(b2->core.pos - end, str); kputc('T', str); kputc((b2->core.flag & BAM_FREAD1)? '1' : '2', str); // segment index kputc((b2->core.flag & BAM_FREVERSE)? 'R' : 'F', str); // strand for (i = 0, cigar = bam_get_cigar(b2); i < b2->core.n_cigar; ++i) { kputw(bam_cigar_oplen(cigar[i]), str); kputc(bam_cigar_opchr(cigar[i]), str); } uint8_t* data; if ((data = bam_aux_get(b1,"ct")) != NULL) bam_aux_del(b1, data); if ((data = bam_aux_get(b2,"ct")) != NULL) bam_aux_del(b2, data); bam_aux_append(b1, "ct", 'Z', str->l+1, (uint8_t*)str->s); }
/** * Gets the cigar from a BAM record */ void bam_get_cigar_string(bam1_t *s, kstring_t *cigar_string) { cigar_string->l=0; int32_t n_cigar_op = bam_get_n_cigar_op(s); if (n_cigar_op) { uint32_t *cigar = bam_get_cigar(s); for (int32_t i = 0; i < n_cigar_op; ++i) { kputw(bam_cigar_oplen(cigar[i]), cigar_string); kputc(bam_cigar_opchr(cigar[i]), cigar_string); } } }
// This function reads a BAM alignment from one BAM file. static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup { aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure int ret; while (1) { ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b); if ( ret<0 ) break; if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue; if ( (int)b->core.qual < aux->min_mapQ ) continue; if ( aux->min_len && bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)) < aux->min_len ) continue; break; } return ret; }
// Returns 0 on success, -1 on failure. static int bam_format_cigar(const bam1_t* b, kstring_t* str) { // An empty cigar is a special case return "*" rather than "" if (b->core.n_cigar == 0) { return (kputc('*', str) == EOF) ? -1 : 0; } const uint32_t *cigar = bam_get_cigar(b); uint32_t i; for (i = 0; i < b->core.n_cigar; ++i) { if (kputw(bam_cigar_oplen(cigar[i]), str) == EOF) return -1; if (kputc(bam_cigar_opchr(cigar[i]), str) == EOF) return -1; } return 0; }
int read_bam(void *data, bam1_t *b) { aux_t *aux = (aux_t*)data; int ret; while(1) { uint8_t *tmp = 0; ret = sam_read1(aux->fp, aux->hdr, b); if (ret < 0) break; if (b->core.flag & (BAM_FUNMAP)) continue; if ((int)b->core.qual < aux->min_mapq) continue; if (bam_cigar2ulen(b->core.n_cigar, bam_get_cigar(b)) < aux->min_len) continue; tmp = bam_aux_get(b, "AS"); if (tmp && bam_aux2i(tmp) < aux->min_as) continue; break; } return ret; }
static int32_t unclipped_start(bam1_t *b) { uint32_t *cigar = bam_get_cigar(b); int32_t clipped = 0; uint32_t i; for (i = 0; i < b->core.n_cigar; i++) { char c = bam_cigar_opchr(cigar[i]); if (c == 'S' || c == 'H') { // clips clipped += bam_cigar_oplen(cigar[i]); } else { break; } } return b->core.pos - clipped + 1; }
/* Finds InDels in a BAM or CRAM file, adding them to the linked list * * fp Input BAM/CRAM file * hdr The header for the BAM/CRAM file * k The K-mer size * * discussion The linked list will need to be destroyed with destroyNodes() */ void findInDels(htsFile *fp, bam_hdr_t *hdr, int minMAPQ, int k) { bam1_t *b = bam_init1(); int i, op; InDel *node; uint32_t *cigar; while(sam_read1(fp, hdr, b) > 0) { if(b->core.qual < minMAPQ) continue; cigar = bam_get_cigar(b); for(i=0; i<b->core.n_cigar; i++) { op = bam_cigar_op(cigar[i]); if(op == 1 || op == 2) { node = makeNode(b, i); if(node == NULL) goto quit; insertNode(node, k); while(++i < b->core.n_cigar) { //Skip adjacent D/I operations op = bam_cigar_op(cigar[i]); if(op != 1 && op != 2) break; continue; } } } } //Ensure that all ROIs are at least k apart lastTargetNode = firstTargetNode->next; while(lastTargetNode->next) { i = TargetNodeCmp(lastTargetNode,lastTargetNode->next, k); assert(i<=0); if(i==0) { lastTargetNode->end = lastTargetNode->next->end; lastTargetNode->count += (lastTargetNode->count+lastTargetNode->next->count > lastTargetNode->count)?lastTargetNode->next->count:0xFFFFFFFF; removeNode(lastTargetNode->next); } else { lastTargetNode = lastTargetNode->next; } } quit: bam_destroy1(b); }
// Returns 0 to indicate read should be output 1 otherwise static int process_aln(const bam_hdr_t *h, bam1_t *b, samview_settings_t* settings) { if (settings->remove_B) bam_remove_B(b); if (settings->min_qlen > 0) { int k, qlen = 0; uint32_t *cigar = bam_get_cigar(b); for (k = 0; k < b->core.n_cigar; ++k) if ((bam_cigar_type(bam_cigar_op(cigar[k]))&1) || bam_cigar_op(cigar[k]) == BAM_CHARD_CLIP) qlen += bam_cigar_oplen(cigar[k]); if (qlen < settings->min_qlen) return 1; } if (b->core.qual < settings->min_mapQ || ((b->core.flag & settings->flag_on) != settings->flag_on) || (b->core.flag & settings->flag_off)) return 1; if (settings->bed && (b->core.tid < 0 || !bed_overlap(settings->bed, h->target_name[b->core.tid], b->core.pos, bam_endpos(b)))) return 1; if (settings->subsam_frac > 0.) { uint32_t k = __ac_Wang_hash(__ac_X31_hash_string(bam_get_qname(b)) ^ settings->subsam_seed); if ((double)(k&0xffffff) / 0x1000000 >= settings->subsam_frac) return 1; } if (settings->rghash) { uint8_t *s = bam_aux_get(b, "RG"); if (s) { khint_t k = kh_get(rg, settings->rghash, (char*)(s + 1)); if (k == kh_end(settings->rghash)) return 1; } } if (settings->library) { const char *p = bam_get_library((bam_hdr_t*)h, b); if (!p || strcmp(p, settings->library) != 0) return 1; } if (settings->remove_aux_len) { size_t i; for (i = 0; i < settings->remove_aux_len; ++i) { uint8_t *s = bam_aux_get(b, settings->remove_aux[i]); if (s) { bam_aux_del(b, s); } } } return 0; }
static int unpad_seq(bam1_t *b, kstring_t *s) { // Returns 0 on success, -1 on an error int k, j, i; int length; int cigar_n_warning = 0; /* Make this a global and limit to one CIGAR N warning? */ uint32_t *cigar = bam_get_cigar(b); uint8_t *seq = bam_get_seq(b); // b->core.l_qseq gives length of the SEQ entry (including soft clips, S) // We need the padded length after alignment from the CIGAR (excluding // soft clips S, but including pads from CIGAR D operations) length = bam_cigar2rlen(b->core.n_cigar, cigar); ks_resize(s, length); for (k = 0, s->l = 0, j = 0; k < b->core.n_cigar; ++k) { int op, ol; op = bam_cigar_op(cigar[k]); ol = bam_cigar_oplen(cigar[k]); if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { for (i = 0; i < ol; ++i, ++j) s->s[s->l++] = bam_seqi(seq, j); } else if (op == BAM_CSOFT_CLIP) { j += ol; } else if (op == BAM_CHARD_CLIP) { /* do nothing */ } else if (op == BAM_CDEL) { for (i = 0; i < ol; ++i) s->s[s->l++] = 0; } else if (op == BAM_CREF_SKIP) { /* Treat CIGAR N as D (not ideal, but better than ignoring it) */ for (i = 0; i < ol; ++i) s->s[s->l++] = 0; if (0 == cigar_n_warning) { cigar_n_warning = -1; fprintf(stderr, "[depad] WARNING: CIGAR op N treated as op D in read %s\n", bam_get_qname(b)); } } else { fprintf(stderr, "[depad] ERROR: Didn't expect CIGAR op %c in read %s\n", BAM_CIGAR_STR[op], bam_get_qname(b)); return -1; } } return length != s->l; }
void add_bam_record_to_wiggle(const bam1_t *b, Wiggle& wiggle) { double w; if (no_fractional_weight) w = 1.0; else { uint8_t *p_tag = bam_aux_get(b, "ZW"); if (p_tag == NULL) return; w = bam_aux2f(p_tag); } int pos = b->core.pos; uint32_t *p = bam_get_cigar(b); for (int i = 0; i < (int)b->core.n_cigar; ++i, ++p) { char op = bam_cigar_op(*p); int op_len = bam_cigar_oplen(*p); if (op == BAM_CMATCH) for (int j = 0; j < op_len; ++j, ++pos) wiggle.read_depth[pos] += w; else pos += ((bam_cigar_type(op) & 2) ? op_len : 0); } }
static int32_t unclipped_end(bam1_t *b) { uint32_t *cigar = bam_get_cigar(b); int32_t end_pos, clipped = 0; int32_t i; end_pos = bam_endpos(b); // now get the clipped end bases (if any) // if we get to the beginning of the cigar string // without hitting a non-clip then the results are meaningless for (i = b->core.n_cigar - 1; i >= 0; i--) { char c = bam_cigar_opchr(cigar[i]); if (c == 'S' || c == 'H') { // clips clipped += bam_cigar_oplen(cigar[i]); } else { break; } } return end_pos + clipped; }
Mapping(const bam_hdr_t * hdr_p, bam1_t * rec_p) : _rec_p(rec_p) { _query_name = bam_get_qname(rec_p); _flag = rec_p->core.flag; for (int i = 0; i < rec_p->core.l_qseq; ++i) { _seq += seq_nt16_str[bam_seqi(bam_get_seq(rec_p), i)]; } if (is_mapped()) { _chr_name = hdr_p->target_name[rec_p->core.tid]; _rf_start = rec_p->core.pos; _cigar = Cigar(bam_get_cigar(rec_p), rec_p->core.n_cigar); _rf_len = _cigar.rf_len(); } if (is_paired() and mp_is_mapped()) { _mp_chr_name = hdr_p->target_name[rec_p->core.mtid]; _mp_rf_start = rec_p->core.mpos; } }
/* Make a node containing an InDel * * b The input read * cigar_op_num The operation number of the Insertion/Deletion * * returns a node, that must be either inserted into the linked list or * destroyed with destroyNode() */ InDel *makeNode(bam1_t *b, int cigar_op_num) { int i, op, oplen, quit = 0, type; int32_t start = b->core.pos-1; int32_t end; uint32_t *cigar = bam_get_cigar(b); InDel *node; for(i=0; i<cigar_op_num; i++) { oplen = bam_cigar_oplen(cigar[i]); type = bam_cigar_type(bam_cigar_op(cigar[i])); if(type & 2) start += oplen; } end = ++start; for(i=cigar_op_num; i<b->core.n_cigar; i++) { op = bam_cigar_op(cigar[i]); oplen = bam_cigar_oplen(cigar[i]); switch(op) { case 1: //I case 2: //D end = (end>start+oplen) ? end : start+oplen; break; default : quit = 1; break; } if(quit) break; } //Make the node node = calloc(1, sizeof(InDel)); node->tid = b->core.tid; node->start = start; node->end = end; node->count = 1; return node; }
string bam_cigarString (bam1_t *b) {//output CIGAR string // kstring_t strK; // kstring_t *str=&strK; const bam1_core_t *c = &b->core; string cigarString(""); if ( c->n_cigar > 0 ) { uint32_t *cigar = bam_get_cigar(b); for (int i = 0; i < c->n_cigar; ++i) { cigarString+=to_string((uint)bam_cigar_oplen(cigar[i]))+bam_cigar_opchr(cigar[i]); }; }; // if (c->n_cigar) { // cigar // for (int i = 0; i < c->n_cigar; ++i) { // kputw(bam_cigar_oplen(cigar[i]), str); // kputc(bam_cigar_opchr(cigar[i]), str); // } // } else kputc('*', str); // // string cigarString (str->s,str->l); return cigarString; };
void bam_fillmd1_core(bam1_t *b, char *ref, int ref_len, int flag, int max_nm) { uint8_t *seq = bam_get_seq(b); uint32_t *cigar = bam_get_cigar(b); bam1_core_t *c = &b->core; int i, x, y, u = 0; kstring_t *str; int32_t old_nm_i = -1, nm = 0; str = (kstring_t*)calloc(1, sizeof(kstring_t)); for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { int j, l = cigar[i]>>4, op = cigar[i]&0xf; if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { for (j = 0; j < l; ++j) { int c1, c2, z = y + j; if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]]; if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 0xf0 : 0x0f; ++u; } else { kputw(u, str); kputc(ref[x+j], str); u = 0; ++nm; } } if (j < l) break; x += l; y += l; } else if (op == BAM_CDEL) { kputw(u, str); kputc('^', str); for (j = 0; j < l; ++j) { if (x+j >= ref_len || ref[x+j] == '\0') break; kputc(ref[x+j], str); } u = 0; x += j; nm += j; if (j < l) break; } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) { y += l; if (op == BAM_CINS) nm += l; } else if (op == BAM_CREF_SKIP) { x += l; } } kputw(u, str); // apply max_nm if (max_nm > 0 && nm >= max_nm) { for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { int j, l = cigar[i]>>4, op = cigar[i]&0xf; if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { for (j = 0; j < l; ++j) { int c1, c2, z = y + j; if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]]; if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match seq[z/2] |= (z&1)? 0x0f : 0xf0; bam_get_qual(b)[z] = 0; } } if (j < l) break; x += l; y += l; } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l; else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; } } // update NM if ((flag & UPDATE_NM) && !(c->flag & BAM_FUNMAP)) { uint8_t *old_nm = bam_aux_get(b, "NM"); if (old_nm) old_nm_i = bam_aux2i(old_nm); if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); else if (nm != old_nm_i) { fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam_get_qname(b), old_nm_i, nm); bam_aux_del(b, old_nm); bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); } } // update MD if ((flag & UPDATE_MD) && !(c->flag & BAM_FUNMAP)) { uint8_t *old_md = bam_aux_get(b, "MD"); if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); else { int is_diff = 0; if (strlen((char*)old_md+1) == str->l) { for (i = 0; i < str->l; ++i) if (toupper(old_md[i+1]) != toupper(str->s[i])) break; if (i < str->l) is_diff = 1; } else is_diff = 1; if (is_diff) { fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), old_md+1, str->s); bam_aux_del(b, old_md); bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); } } } // drop all tags but RG if (flag&DROP_TAG) { uint8_t *q = bam_aux_get(b, "RG"); bam_aux_drop_other(b, q); } // reduce the resolution of base quality if (flag&BIN_QUAL) { uint8_t *qual = bam_get_qual(b); for (i = 0; i < b->core.l_qseq; ++i) if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7; } free(str->s); free(str); }
void profileReads(char* bamFile, int ignoreSuppAlignments, int ignoreSecondaryAlignments) { // int result = -1; int supp_check = 0x0; if (ignoreSuppAlignments) { supp_check |= BAM_FSUPPLEMENTARY; } if (ignoreSecondaryAlignments) { supp_check |= BAM_FSECONDARY; } // helper variables BGZF* in = 0 ; bam1_t *b = bam_init1(); bam_hdr_t *h; // open bam if ((in = bgzf_open(bamFile, "r")) == 0) { fprintf(stderr, "ERROR: Failed to open \"%s\" for reading.\n", bamFile); } else if ((h = bam_hdr_read(in)) == 0) { // read header fprintf(stderr, "ERROR: Failed to read BAM header of file \"%s\".\n", bamFile); } else { // destroy header bam_hdr_destroy(h); int line = 0; int supplementary, secondary; int mapQual; int matches, mismatches, qLen; float pcAln, pcId; int showStats = 0; uint8_t *aux_mismatches; // print header printf("line\tsupp\tsecondary\tmapQ\tmismatches\tmatches\tqLen\tpcId\tpcAln\n"); // fetch alignments while ((result = bam_read1(in, b)) >= 0) { line += 1; // only primary mappings if ((b->core.flag & supp_check) != 0) { if (showStats) fprintf(stdout, "Rejected %d, non-primary\n", line); continue; } supplementary = (b->core.flag & (1 | BAM_FSUPPLEMENTARY)) != 0; secondary = (b->core.flag & (1 | BAM_FSECONDARY)) != 0; // quality mapQual = b->core.qual; // bam_aux_get returns 0 if optional NM tag is missing if ((aux_mismatches = bam_aux_get(b, "NM"))) mismatches = bam_aux2i(aux_mismatches); else mismatches = 0; // length qLen = bam_cigar2qlen((&b->core)->n_cigar, bam_get_cigar(b)); // percent identity matches = bam_cigar2matches((&b->core)->n_cigar, bam_get_cigar(b)); pcId = (matches - mismatches) / (float)matches; // percentage as float between 0 to 1 // percent alignment pcAln = matches / (float)qLen; // percentage as float between 0 to 1 // print read values printf("%d\t%d\t%d\t%d\t%d\t%d\t%d\t%.4f\t%.4f\n", line, supplementary, secondary, mapQual, mismatches, matches, qLen, pcId, pcAln); } if (result < -1) { fprintf(stderr, "ERROR: retrieval of read no. %d from file \"%s\" failed with code %d.\n", line, bamFile, result); } } if (in) bgzf_close(in); bam_destroy1(b); }
int bam_pad2unpad(samFile *in, samFile *out, bam_hdr_t *h, faidx_t *fai) { bam1_t *b = 0; kstring_t r, q; int r_tid = -1; uint32_t *cigar2 = 0; int ret = 0, n2 = 0, m2 = 0, *posmap = 0; b = bam_init1(); r.l = r.m = q.l = q.m = 0; r.s = q.s = 0; int read_ret; while ((read_ret = sam_read1(in, h, b)) >= 0) { // read one alignment from `in' // Cannot depad unmapped CRAM data if (b->core.flag & BAM_FUNMAP) goto next_seq; uint32_t *cigar = bam_get_cigar(b); n2 = 0; if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam_get_qname(b), h->target_name[b->core.tid]) == 0) { // fprintf(stderr, "[depad] Found embedded reference '%s'\n", bam_get_qname(b)); r_tid = b->core.tid; if (0!=unpad_seq(b, &r)) { fprintf(stderr, "[depad] ERROR: Problem parsing SEQ and/or CIGAR in reference %s\n", bam_get_qname(b)); return -1; }; if (h->target_len[r_tid] != r.l) { fprintf(stderr, "[depad] ERROR: (Padded) length of '%s' is %u in BAM header, but %llu in embedded reference\n", bam_get_qname(b), h->target_len[r_tid], (unsigned long long)(r.l)); return -1; } if (fai) { // Check the embedded reference matches the FASTA file if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &q)) { fprintf(stderr, "[depad] ERROR: Failed to load embedded reference '%s' from FASTA\n", h->target_name[b->core.tid]); return -1; } assert(r.l == q.l); int i; for (i = 0; i < r.l; ++i) { if (r.s[i] != q.s[i]) { // Show gaps as ASCII 45 fprintf(stderr, "[depad] ERROR: Embedded sequence and reference FASTA don't match for %s base %i, '%c' vs '%c'\n", h->target_name[b->core.tid], i+1, r.s[i] ? seq_nt16_str[(int)r.s[i]] : 45, q.s[i] ? seq_nt16_str[(int)q.s[i]] : 45); return -1; } } } write_cigar(cigar2, n2, m2, bam_cigar_gen(b->core.l_qseq, BAM_CMATCH)); replace_cigar(b, n2, cigar2); posmap = update_posmap(posmap, r); } else if (b->core.n_cigar > 0) { int i, k, op; if (b->core.tid < 0) { fprintf(stderr, "[depad] ERROR: Read '%s' has CIGAR but no RNAME\n", bam_get_qname(b)); return -1; } else if (b->core.tid == r_tid) { ; // good case, reference available //fprintf(stderr, "[depad] Have ref '%s' for read '%s'\n", h->target_name[b->core.tid], bam_get_qname(b)); } else if (fai) { if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) { fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]); return -1; } posmap = update_posmap(posmap, r); r_tid = b->core.tid; // fprintf(stderr, "[depad] Loaded %s from FASTA file\n", h->target_name[b->core.tid]); } else { fprintf(stderr, "[depad] ERROR: Missing %s embedded reference sequence (and no FASTA file)\n", h->target_name[b->core.tid]); return -1; } if (0!=unpad_seq(b, &q)) { fprintf(stderr, "[depad] ERROR: Problem parsing SEQ and/or CIGAR in read %s\n", bam_get_qname(b)); return -1; }; if (bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) { write_cigar(cigar2, n2, m2, cigar[0]); } else if (bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP) { write_cigar(cigar2, n2, m2, cigar[0]); if (b->core.n_cigar > 2 && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP) { write_cigar(cigar2, n2, m2, cigar[1]); } } /* Determine CIGAR operator for each base in the aligned read */ for (i = 0, k = b->core.pos; i < q.l; ++i, ++k) q.s[i] = q.s[i]? (r.s[k]? BAM_CMATCH : BAM_CINS) : (r.s[k]? BAM_CDEL : BAM_CPAD); /* Include any pads if starts with an insert */ if (q.s[0] == BAM_CINS) { for (k = 0; k+1 < b->core.pos && !r.s[b->core.pos - k - 1]; ++k); if (k) write_cigar(cigar2, n2, m2, bam_cigar_gen(k, BAM_CPAD)); k = 0; } else if (q.s[0] == BAM_CPAD) { // Join 'k' CPAD to our first cigar op CPAD too. for (k = 0; k+1 < b->core.pos && !r.s[b->core.pos - k - 1]; ++k); } else { k = 0; } /* Count consecutive CIGAR operators to turn into a CIGAR string */ for (i = 1, k++, op = q.s[0]; i < q.l; ++i) { if (op != q.s[i]) { write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op)); op = q.s[i]; k = 1; } else ++k; } write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op)); if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CSOFT_CLIP) { write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]); } else if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CHARD_CLIP) { if (b->core.n_cigar > 2 && bam_cigar_op(cigar[b->core.n_cigar-2]) == BAM_CSOFT_CLIP) { write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-2]); } write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]); } /* Remove redundant P operators between M/X/=/D operators, e.g. 5M2P10M -> 15M */ int pre_op, post_op; for (i = 2; i < n2; ++i) if (bam_cigar_op(cigar2[i-1]) == BAM_CPAD) { pre_op = bam_cigar_op(cigar2[i-2]); post_op = bam_cigar_op(cigar2[i]); /* Note don't need to check for X/= as code above will use M only */ if ((pre_op == BAM_CMATCH || pre_op == BAM_CDEL) && (post_op == BAM_CMATCH || post_op == BAM_CDEL)) { /* This is a redundant P operator */ cigar2[i-1] = 0; // i.e. 0M /* If had same operator either side, combine them in post_op */ if (pre_op == post_op) { /* If CIGAR M, could treat as simple integers since BAM_CMATCH is zero*/ cigar2[i] = bam_cigar_gen(bam_cigar_oplen(cigar2[i-2]) + bam_cigar_oplen(cigar2[i]), post_op); cigar2[i-2] = 0; // i.e. 0M } } } /* Remove the zero'd operators (0M) */ for (i = k = 0; i < n2; ++i) if (cigar2[i]) cigar2[k++] = cigar2[i]; n2 = k; replace_cigar(b, n2, cigar2); } /* Even unmapped reads can have a POS value, e.g. if their mate was mapped */ if (b->core.pos != -1) b->core.pos = posmap[b->core.pos]; if (b->core.mtid < 0 || b->core.mpos < 0) { /* Nice case, no mate to worry about*/ // fprintf(stderr, "[depad] Read '%s' mate not mapped\n", bam_get_qname(b)); /* TODO - Warning if FLAG says mate should be mapped? */ /* Clean up funny input where mate position is given but mate reference is missing: */ b->core.mtid = -1; b->core.mpos = -1; } else if (b->core.mtid == b->core.tid) { /* Nice case, same reference */ // fprintf(stderr, "[depad] Read '%s' mate mapped to same ref\n", bam_get_qname(b)); b->core.mpos = posmap[b->core.mpos]; } else { /* Nasty case, Must load alternative posmap */ // fprintf(stderr, "[depad] Loading reference '%s' temporarily\n", h->target_name[b->core.mtid]); if (!fai) { fprintf(stderr, "[depad] ERROR: Needed reference %s sequence for mate (and no FASTA file)\n", h->target_name[b->core.mtid]); return -1; } /* Temporarily load the other reference sequence */ if (load_unpadded_ref(fai, h->target_name[b->core.mtid], h->target_len[b->core.mtid], &r)) { fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.mtid]); return -1; } posmap = update_posmap(posmap, r); b->core.mpos = posmap[b->core.mpos]; /* Restore the reference and posmap*/ if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) { fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]); return -1; } posmap = update_posmap(posmap, r); } /* Most reads will have been moved so safest to always recalculate the BIN value */ b->core.bin = bam_reg2bin(b->core.pos, bam_endpos(b)); next_seq: sam_write1(out, h, b); } if (read_ret < -1) { fprintf(stderr, "[depad] truncated file.\n"); ret = 1; } free(r.s); free(q.s); free(posmap); bam_destroy1(b); return ret; }
int main(int argc, char **argv) { if (argc < 4) errx(1, "usage\t:%s <bam> <split out> <discord out> (optional #threads)", argv[0]); char *bam_file_name = argv[1]; char *split_file_name = argv[2]; char *disc_file_name = argv[3]; int threads = 2; if (argc == 5) { threads = atoi(argv[4]); } samFile *disc = sam_open(disc_file_name, "wb"); samFile *split = sam_open(split_file_name, "wb"); samFile *in = sam_open(bam_file_name, "rb"); if(in == NULL) errx(1, "Unable to open BAM/SAM file."); // TODO: handle cram. if (threads > 1) { bgzf_mt(in->fp.bgzf, threads, 256); } hts_idx_t *idx = sam_index_load(in, bam_file_name); if(idx == NULL) errx(1,"Unable to open BAM/SAM index."); bam_hdr_t *hdr = sam_hdr_read(in); int r = sam_hdr_write(disc, hdr); r = sam_hdr_write(split, hdr); bam1_t *aln = bam_init1(); int ret; while(ret = sam_read1(in, hdr, aln) >= 0) { if (((aln->core.flag) & 1294) == 0) r = sam_write1(disc, hdr, aln); uint8_t *sa = bam_aux_get(aln, "SA"); if (sa != 0) { char *sa_tag = strdup(bam_aux2Z(sa)); if ( count_tags(sa_tag) == 1) { char *chrm, strand, *cigar; uint32_t pos; split_sa_tag(sa_tag, &chrm, &pos, &strand, &cigar); struct line sa, al; calcOffsets(cigar, pos, strand, &sa); sa.chrm = chrm; sa.strand = strand; calcAlnOffsets(bam_get_cigar(aln), aln->core.n_cigar, aln->core.pos, bam_is_rev(aln) ? '-' : '+', &al); al.chrm = hdr->target_name[aln->core.tid]; al.strand = bam_is_rev(aln) ? '-' : '+'; struct line *left = &al, *right = &sa; if (left->SQO > right->SQO) { left = &sa; right = &al; } int overlap = MAX(1 + MIN(left->EQO, right->EQO) - MAX(left->SQO, right->SQO), 0); int alen1 = 1 + left->EQO - left->SQO; int alen2 = 1 + right->EQO - right->SQO; int mno = MIN(alen1-overlap, alen2-overlap); if (mno < MIN_NON_OVERLAP) continue; if ( (strcmp(left->chrm, right->chrm) == 0) && (left->strand == right->strand) ) { int leftDiag, rightDiag, insSize; if (left->strand == '-') { leftDiag = left->rapos - left->sclip; rightDiag = (right->rapos + right->raLen) - (right->sclip + right->qaLen); insSize = rightDiag - leftDiag; } else { leftDiag = (left->rapos + left->raLen) - (left->sclip + left->qaLen); rightDiag = right->rapos - right->sclip; insSize = leftDiag - rightDiag; } int desert = right->SQO - left->EQO - 1; if ((abs(insSize) < MIN_INDEL_SIZE) || ((desert > 0) && ( (desert - (int)MAX(0, insSize)) > MAX_UNMAPPED_BASES))) continue; } char *qname = bam_get_qname(aln); if ((aln->core.flag & 64) == 64) qname[0] = 'A'; else qname[0] = 'B'; r = sam_write1(split, hdr, aln); } free(sa_tag); } } bam_destroy1(aln); hts_idx_destroy(idx); bam_hdr_destroy(hdr); sam_close(in); sam_close(disc); sam_close(split); if(ret < -1) { errx(1, "lumpy_filter: error reading bam: %s\n", bam_file_name); } }
int calcCoverage(char *fName, Slice *slice, htsFile *in, hts_idx_t *idx, int flags) { int ref; int begRange; int endRange; char region[1024]; char region_name[512]; if (Slice_getChrStart(slice) != 1) { fprintf(stderr, "Currently only allow a slice start position of 1\n"); return 1; } if (flags & M_UCSC_NAMING) { sprintf(region,"chr%s", Slice_getSeqRegionName(slice)); } else { sprintf(region,"%s", Slice_getSeqRegionName(slice)); } bam_hdr_t *header = bam_hdr_init(); header = bam_hdr_read(in->fp.bgzf); ref = bam_name2id(header, region); if (ref < 0) { fprintf(stderr, "Invalid region %s\n", region); exit(1); } sprintf(region,"%s:%ld-%ld", region_name, Slice_getSeqRegionStart(slice), Slice_getSeqRegionEnd(slice)); if (hts_parse_reg(region, &begRange, &endRange) == NULL) { fprintf(stderr, "Could not parse %s\n", region); exit(2); } bam_hdr_destroy(header); hts_itr_t *iter = sam_itr_queryi(idx, ref, begRange, endRange); bam1_t *b = bam_init1(); Coverage *coverage = calloc(Slice_getLength(slice),sizeof(Coverage)); long counter = 0; long overlapping = 0; long bad = 0; int startIndex = 0; while (bam_itr_next(in, iter, b) >= 0) { if (b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)) { bad++; continue; } int end; //end = bam_calend(&b->core, bam1_cigar(b)); end = bam_endpos(b); // There is a special case for reads which have zero length and start at begRange (so end at begRange ie. before the first base we're interested in). // That is the reason for the || end == begRange test if (end == begRange) { continue; } counter++; if (!(counter%1000000)) { if (verbosity > 1) { printf("."); } fflush(stdout); } // Remember: b->core.pos is zero based! int cigInd; int refPos; int readPos; uint32_t *cigar = bam_get_cigar(b); for (cigInd = readPos = 0, refPos = b->core.pos; cigInd < b->core.n_cigar; ++cigInd) { int k; int lenCigBlock = cigar[cigInd]>>4; int op = cigar[cigInd]&0xf; if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { for (k = 0; k < lenCigBlock; ++k) { //if (ref[refPos+k] == 0) break; // out of boundary coverage[refPos+k].coverage++; } if (k < lenCigBlock) break; refPos += lenCigBlock; readPos += lenCigBlock; } else if (op == BAM_CDEL) { for (k = 0; k < lenCigBlock; ++k) { // if (ref[refPos+k] == 0) break; coverage[refPos+k].coverage++; } if (k < lenCigBlock) break; refPos += lenCigBlock; } else if (op == BAM_CSOFT_CLIP) { readPos += lenCigBlock; } else if (op == BAM_CHARD_CLIP) { } else if (op == BAM_CINS) { readPos += lenCigBlock; } else if (op == BAM_CREF_SKIP) { refPos += lenCigBlock; } } #ifdef DONE int j; int done = 0; int hadOverlap = 0; for (j=startIndex; j < Vector_getNumElement(genes) && !done; j++) { Gene *gene = Vector_getElementAt(genes,j); if (!gene) { continue; } // Remember: b->core.pos is zero based! if (b->core.pos < Gene_getEnd(gene) && end >= Gene_getStart(gene)) { int k; int doneGene = 0; for (k=0; k<Gene_getTranscriptCount(gene) && !doneGene; k++) { Transcript *trans = Gene_getTranscriptAt(gene,k); if (b->core.pos < Transcript_getEnd(trans) && end >= Transcript_getStart(trans)) { int m; for (m=0; m<Transcript_getExonCount(trans) && !doneGene; m++) { Exon *exon = Transcript_getExonAt(trans,m); if (b->core.pos < Exon_getEnd(exon) && end >= Exon_getStart(exon)) { // Only count as overlapping once (could be that a read overlaps more than one gene) if (!hadOverlap) { overlapping++; hadOverlap = 1; } gs = IDHash_getValue(geneCountsHash, Gene_getDbID(gene)); gs->score++; doneGene = 1; } } } } } else if (Gene_getStart(gene) > end) { done = 1; } else if (Gene_getEnd(gene) < b->core.pos+1) { gs = IDHash_getValue(geneCountsHash, Gene_getDbID(gene)); printf("Gene %s (%s) score %ld\n",Gene_getStableId(gene), Gene_getDisplayXref(gene) ? DBEntry_getDisplayId(Gene_getDisplayXref(gene)) : "", gs->score); if (verbosity > 1) { printf("Removing gene %s (index %d) with extent %d to %d\n", Gene_getStableId(gene), gs->index, Gene_getStart(gene), Gene_getEnd(gene)); } Vector_setElementAt(genes,j,NULL); // Magic (very important for speed) - move startIndex to first non null gene int n; startIndex = 0; for (n=0;n<Vector_getNumElement(genes);n++) { void *v = Vector_getElementAt(genes,n); if (v != NULL) { break; } startIndex++; } if (verbosity > 1) { printf("startIndex now %d\n",startIndex); } } } #endif } if (verbosity > 1) { printf("\n"); } #ifdef DONE // Print out read counts for what ever's left in the genes array int n; for (n=0;n<Vector_getNumElement(genes);n++) { Gene *gene = Vector_getElementAt(genes,n); if (gene != NULL) { gs = IDHash_getValue(geneCountsHash, Gene_getDbID(gene)); printf("Gene %s (%s) score %ld\n",Gene_getStableId(gene), Gene_getDisplayXref(gene) ? DBEntry_getDisplayId(Gene_getDisplayXref(gene)) : "", gs->score); } } #endif printf("Read %ld reads. Number of bad reads (unmapped, qc fail, secondary, dup) %ld\n", counter, bad); long i; for (i=0; i< Slice_getLength(slice); i++) { printf("%ld %ld\n", i+1, coverage[i].coverage); } sam_itr_destroy(iter); bam_destroy1(b); return 1; }
static bool sam_fetch_coords(const CallFileEntry *centry, const char *flank5p, size_t flank5p_len, const char *flank3p, size_t flank3p_len, size_t *cpy_flnk_5p, size_t *cpy_flnk_3p, const read_t **chrom_ptr, size_t *start, size_t *end, bool *fw_strand_ptr) { // Get the next primary alignment do { if(sam_read1(samfh, bam_header, bamentry) < 0) die("We've run out of SAM entries!"); } while(bamentry->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY)); if(bamentry->core.flag & BAM_FUNMAP) { num_flank5p_unmapped++; return false; } if(bamentry->core.qual < min_mapq) { num_flank5p_lowqual++; return false; } bool fw_strand = !bam_is_rev(bamentry); *fw_strand_ptr = fw_strand; const char *chrom_name = bam_header->target_name[bamentry->core.tid]; const read_t *chrom = seq_fetch_chrom(genome, chrom_name); *chrom_ptr = chrom; const uint32_t *cigar = bam_get_cigar(bamentry); int cigar2rlen = bam_cigar2rlen(bamentry->core.n_cigar, cigar); // cpy_flnk_5p is soft clipped at right end of flank // Eat up hard masked (H), soft masked (S) and inserted bases (relative to ref) (I) *cpy_flnk_5p = bam_get_end_padding(bamentry->core.n_cigar, cigar); *cpy_flnk_3p = 0; // set this later // Get bam query name char *bname = bam_get_qname(bamentry); // Check entry/flank names match const char *hdrline = call_file_get_line(centry, 0); if(hdrline[0] != '>') die("Unexpected line: %s", hdrline); hdrline++; const char *hdrline_end = str_fasta_name_end(hdrline); int hdrline_len = hdrline_end - hdrline; if(strncmp(hdrline, bname, hdrline_len) != 0) die("SAM/BAM and call entries mismatch '%s' vs '%s'", hdrline, bname); // Find 3p flank position using search for first kmer char endkmer[200]; ctx_assert(kmer_size+1 <= sizeof(endkmer)); ctx_assert(flank3p_len >= kmer_size || call_file_min_allele_len(centry) == 0); bubble_get_end_kmer(flank5p, flank5p_len, flank3p, flank3p_len, kmer_size, endkmer); if(!fw_strand) dna_revcomp_str(endkmer, endkmer, kmer_size); // Determine search space // Choose a region of the ref to search for the end flank // end is index after last char long search_start, search_end; size_t longest_allele = call_file_max_allele_len(centry); if(fw_strand) { search_start = (long)bamentry->core.pos + cigar2rlen - kmer_size*2; search_end = (long)bamentry->core.pos + cigar2rlen + longest_allele + kmer_size*2 + 10; } else { search_start = (long)bamentry->core.pos - (longest_allele + kmer_size*2 + 10); search_end = (long)bamentry->core.pos + kmer_size*2; } search_start = MAX2(search_start, 0); search_end = MIN2(search_end, (long)chrom->seq.end); const char *search_region = chrom->seq.b + search_start; size_t search_len = (size_t)(search_end - search_start); // Now do search with kmer // Attempt to find perfect match for kmer within search region // Search, if there is more than one match -> abandon const char *kmer_match = ctx_strnstr(search_region, endkmer, search_len); if(kmer_match != NULL) { // Check for multiple hits size_t rem_search_len = search_region+search_len-kmer_match; if(ctx_strnstr(kmer_match+1, endkmer, rem_search_len-1) != NULL) { num_flank3p_multihits++; return false; } if(fw_strand) { *start = bamentry->core.pos + cigar2rlen; *end = kmer_match - chrom->seq.b; } else { *start = kmer_match + kmer_size - chrom->seq.b; *end = bamentry->core.pos; } num_flank3p_exact_match++; return true; } else { // Look for approximate match needleman_wunsch_align2(search_region, endkmer, search_len, kmer_size, &nw_scoring_flank, nw_aligner, aln); num_nw_flank++; const char *ref = aln->result_a, *alt = aln->result_b; // --aa--dd-cge // bb--ccd-ecge // Find positions of first and last match int i, l, r, matches = 0; int ref_offset_left = 0, ref_offset_rght = 0; int alt_offset_left = 0, alt_offset_rght = 0; for(l = 0; l < (int)aln->length && ref[l] != alt[l]; l++) { ref_offset_left += (ref[l] != '-'); alt_offset_left += (alt[l] != '-'); } for(r = aln->length-1; r > 0 && ref[r] != alt[r]; r--) { ref_offset_rght += (ref[r] != '-'); alt_offset_rght += (alt[r] != '-'); } // Count matches for(i = l; i <= r; i++) matches += (ref[i] == alt[i]); if(matches < (int)kmer_size / 2) { // flank doesn't map well num_flank3p_not_found++; return false; } num_flank3p_approx_match++; *cpy_flnk_3p += fw_strand ? alt_offset_left : alt_offset_rght; if(fw_strand) { *start = bamentry->core.pos + cigar2rlen; *end = search_region + ref_offset_left - chrom->seq.b; } else { *start = (search_region + search_len - ref_offset_rght) - chrom->seq.b; *end = bamentry->core.pos; } return true; } }
int do_grep() { #ifdef DEBUGa printf("[!]do_grep\n"); #endif BamInfo_t *pbam; kh_cstr_t BamID; khiter_t ki, bami; kstring_t ks1 = { 0, 0, NULL }; kstring_t ks2 = { 0, 0, NULL }; kstring_t ks3 = { 0, 0, NULL }; samFile *in; bam_hdr_t *h; hts_idx_t *idx; bam1_t *b, *d, *d2, *bR1, *bR2, *bR3; bR1 = bam_init1(); bR2 = bam_init1(); bR3 = bam_init1(); //htsFile *out; //hts_opt *in_opts = NULL, *out_opts = NULL; int r = 0, exit_code = 0; kvec_t(bam1_t) R1, R2, RV; pierCluster_t *pierCluster; //samdat_t tmp_samdat; FILE *fs = fopen("./test.txt","w"); for (bami = kh_begin(bamNFOp); bami != kh_end(bamNFOp); ++bami) { //printf(">[%d]:\n",bami); if (kh_exist(bamNFOp, bami)) { kv_init(R1); kv_init(R2); kv_init(RV); //tmp_samdat = (const samdat_t){ 0 }; //memset(&tmp_samdat,0,sizeof(samdat_t)); //printf("-[%d]:\n",bami); BamID = kh_key(bamNFOp, bami); pbam = &kh_value(bamNFOp, bami); fprintf(stderr, "%u [%s]=%s\t%u %u\n",bami,BamID,pbam->fileName,pbam->insertSize,pbam->SD); in = sam_open(pbam->fileName, "r"); if (in == NULL) { fprintf(stderr, "[x]Error opening \"%s\"\n", pbam->fileName); return EXIT_FAILURE; } h = sam_hdr_read(in); /* out = hts_open("-", "w"); if (out == NULL) { fprintf(stderr, "[x]Error opening standard output\n"); return EXIT_FAILURE; } if (sam_hdr_write(out, h) < 0) { fprintf(stderr, "[!]Error writing output header.\n"); exit_code = 1; } */ int8_t *ChrIsHum; if (h == NULL) { fprintf(stderr, "[x]Couldn't read header for \"%s\"\n", pbam->fileName); return EXIT_FAILURE; } else { ChrIsHum = malloc(h->n_targets * sizeof(int8_t)); for (int32_t i=0; i < h->n_targets; ++i) { //ChrIsHum[i] = -1; ki = kh_get(chrNFO, chrNFOp, h->target_name[i]); if (ki == kh_end(chrNFOp)) { errx(4,"[x]Cannot find ChrID for [%s] !",h->target_name[i]); } else { ChrInfo_t * tmp = &kh_value(chrNFOp, ki); ChrIsHum[i] = tmp->isHum; //printf(">>> %d Chr:%s %d\n",i,h->target_name[i],ChrIsHum[i]); } } } h->ignore_sam_err = 0; b = bam_init1(); d = bam_init1(); d2 = bam_init1(); if ((idx = sam_index_load(in, pbam->fileName)) == 0) { fprintf(stderr, "[E::%s] fail to load the BAM index\n", __func__); return 1; } pierCluster = sam_plp_init(); while ((r = sam_read1(in, h, b)) >= 0) { int8_t flag = false; const bam1_core_t *c = &b->core; if (c->flag & BAM_FSECONDARY) continue; if (c->n_cigar) { uint32_t *cigar = bam_get_cigar(b); for (int i = 0; i < c->n_cigar; ++i) { if (bam_cigar_opchr(cigar[i])=='S') { // soft clipping if ( bam_cigar_oplen(cigar[i]) >= myConfig.minGrepSlen ) { flag = true; } } } } if (flag && ChrIsHum[c->tid]) { // Now, skip Virus items. //bam_copy1(bR1, b); flag = 0; // recycle //int enoughMapQ = 0; //kstring_t ks = { 0, 0, NULL }; /*if (sam_format1(h, b, &ks1) < 0) { fprintf(stderr, "Error writing output.\n"); exit_code = 1; break; } else*/ if ((c->mtid == c->tid && ChrIsHum[c->tid]) || (ChrIsHum[c->tid] ^ ChrIsHum[c->mtid])) { // Only grep those mapped on same Human ChrID, or diff species/一方在病毒的情况. //printf(">[%s]\n",ks_str(&ks1)); flag |= 1; //tmp_samdat.b = bam_dup1(b); //kv_push(samdat_t,R1,tmp_samdat); /*if (checkMapQ(ChrIsHum, b, true)) { ++enoughMapQ; }*/ } if (getPairedSam(in, idx, b, d) != 0) { flag &= ~1; continue; } else { flag |= 2; /*if (checkMapQ(ChrIsHum, d, false)) { ++enoughMapQ; }*/ /*if (c->flag & BAM_FSECONDARY) { if (getPairedSam(in, idx, d, d2) == 0) { //sam_format1(h, d2, &ks3); flag |= 4; if (checkMapQ(ChrIsHum, d2, false)) { ++enoughMapQ; } } }*/ } /* 对于 BAM_FSECONDARY(256) 的 Read,跳两次 与 读 SA 项,效果一样。 >[sf95_Ref_48245009_48245108_48245208_Vir_-_2000_2044_R_100_90 353 chr2 13996555 0 50S40M chr18 48245109 0ACACAACAATGTTCCGGAGACTCTAAGGCCTCCCGATACAGAGCAGAGGCCACACACACACACACCATGGAATACTATTCAGCCAAAAAA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC NM:i:0 MD:Z:40 AS:i:40 XS:i:40 RG:Z:Fsimout_mB SA:Z:rgi|59585|emb|X04615.1|,2000,-,40S46M4S,60,0; YC:Z:CT YD:Z:f] -[sf95_Ref_48245009_48245108_48245208_Vir_-_2000_2044_R_100_90 177 chr18 48245109 9 40S50M gi|59585|emb|X04615.1|2000 0 GTTCCGGAGACTCTAAGGCCTCCCGATACAGAGCAGAGGCCACACACACACACACCATGGAATACTATTCAGCCAAAAAAAGGAATTCAA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC NM:i:0 MD:Z:50 AS:i:50 XS:i:46 RG:Z:Fsimout_mB SA:Z:rgi|59585|emb|X04615.1|,2000,+,50S40M,9,0; YC:Z:GA YD:Z:f] +[sf95_Ref_48245009_48245108_48245208_Vir_-_2000_2044_R_100_90 113 gi|59585|emb|X04615.1| 2000 60 40S46M4S chr18 48245109 0 TTTTTTGGCTGAATAGTATTCCATGGTGTGTGTGTGTGTGGCCTCTGCTCTGTATCGGGAGGCCTTAGAGTCTCCGGAACATTGTTGTGT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC NM:i:0 MD:Z:46 AS:i:46 XS:i:27 RG:Z:Fsimout_mB SA:Z:fchr2,13996555,+,50S40M,0,0; YC:Z:CT YD:Z:r] */ /*if (sam_format1(h, d, &ks2) < 0) { fprintf(stderr, "Error writing output.\n"); exit_code = 1; break; }*/ if (((flag & 3) == 3) /*&& enoughMapQ >= myConfig.samples*/) { /*printf(">%d[%s]\n",checkMapQ(ChrIsHum, b, true),ks_str(&ks1)); printf("-%d[%s]\n",checkMapQ(ChrIsHum, d, false),ks_str(&ks2)); if (flag & 4) { printf("+%d[%s]\n",checkMapQ(ChrIsHum, d2, false),ks_str(&ks3)); } printf("<--%d\n",enoughMapQ);*/ if (sam_plp_push(ChrIsHum, pierCluster, b) == 0) { //printf("--HumRange=%s:%d-%d\n", h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos); if ((!ChrIsHum[(d->core).tid]) && (flag & 2)) sam_plp_push(ChrIsHum, pierCluster, d); //if ((!ChrIsHum[(d2->core).tid]) && (flag & 4)) sam_plp_push(ChrIsHum, pierCluster, d2); } else { //print fprintf(fs,"[%s]\nHumRange=%s:%d-%d\n", BamID, h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos); fprintf(fs,"VirRange=%s:%d-%d\n", h->target_name[(pierCluster->VirusRange).tid], (pierCluster->VirusRange).pos, (pierCluster->VirusRange).endpos); for (size_t i=0; i<kv_size(pierCluster->Reads);++i) { bam1_t *bi = kv_A(pierCluster->Reads, i); if (sam_format1(h, bi, &ks1) < 0) { fprintf(stderr, "Error writing output.\n"); exit_code = 1; break; } else { fprintf(fs,"%s\n",ks1.s); } } fprintf(fs,"\n"); //printf("HumRange=%s:%d-%d\n", h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos); //fflush(fs); sam_plp_dectroy(pierCluster); pierCluster = sam_plp_init(); } } } /*char *qname = bam_get_qname(b); if (sam_write1(out, h, b) < 0) { fprintf(stderr, "[x]Error writing output.\n"); exit_code = 1; break; }*/ } /* r = sam_close(out); // stdout can only be closed once if (r < 0) { fprintf(stderr, "Error closing output.\n"); exit_code = 1; } */ hts_idx_destroy(idx); bam_destroy1(b); bam_destroy1(d); bam_destroy1(d2); bam_hdr_destroy(h); r = sam_close(in); free(ChrIsHum); #ifdef DEBUGa fflush(NULL); //pressAnyKey(); #endif sam_plp_dectroy(pierCluster); //printf("<[%d]:\n",bami); } } fclose(fs); getPairedSam(NULL, NULL, NULL, NULL); // sam_close(fp2); //printf("---[%d]---\n",exit_code); bam_destroy1(bR1); bam_destroy1(bR2); bam_destroy1(bR3); ks_release(&ks1); ks_release(&ks2); ks_release(&ks3); return exit_code; }
void filterReads(char * inBamFile, char * outBamFile, int minMapQual, int minLen, int maxMisMatches, float minPcId, float minPcAln, int ignoreSuppAlignments, int ignoreSecondaryAlignments) { // int result = -1; int outResult = -1; int supp_check = 0x0; if (ignoreSuppAlignments) { supp_check |= BAM_FSUPPLEMENTARY; } if (ignoreSecondaryAlignments) { supp_check |= BAM_FSECONDARY; } // helper variables BGZF* in = 0; BGZF* out = 0; bam1_t *b = bam_init1(); bam_hdr_t *h; // open bam if ((in = bgzf_open(inBamFile, "r")) == 0) { fprintf(stderr, "ERROR: Failed to open \"%s\" for reading.\n", inBamFile); } else if ((h = bam_hdr_read(in)) == 0) { // read header fprintf(stderr, "ERROR: Failed to read BAM header of file \"%s\".\n", inBamFile); } else if ((out = bgzf_open(outBamFile, "w")) == 0) { fprintf(stderr, "ERROR: Failed to open \"%s\" for writing.\n", outBamFile); } else { // write and destroy header bam_hdr_write(out, h); bam_hdr_destroy(h); int line = 0; int matches, mismatches, qLen; float pcAln, pcId; int showStats = 0; // fetch alignments while ((result = bam_read1(in, b)) >= 0) { line += 1; // only primary mappings if ((b->core.flag & supp_check) != 0) { if (showStats) fprintf(stdout, "Rejected %d, non-primary\n", line); continue; } // only high quality if (b->core.qual < minMapQual) { if (showStats) fprintf(stdout, "Rejected %d, quality: %d\n", line, b->core.qual); continue; } // not too many absolute mismatches mismatches = bam_aux2i(bam_aux_get(b, "NM")); if (mismatches > maxMisMatches) { if (showStats) fprintf(stdout, "Rejected %d, mismatches: %d\n", line, mismatches); continue; } // not too short qLen = bam_cigar2qlen((&b->core)->n_cigar, bam_get_cigar(b)); if (qLen < minLen) { if (showStats) fprintf(stdout, "Rejected %d, length: %d\n", line, qLen); continue; } // only high percent identity matches = bam_cigar2matches((&b->core)->n_cigar, bam_get_cigar(b)); pcId = (matches - mismatches) / (float)matches; // percentage as float between 0 to 1 if (pcId < minPcId) { if (showStats) fprintf(stdout, "Rejected %d, identity pc: %.4f\n", line, pcId); continue; } // only high percent alignment pcAln = matches / (float)qLen; // percentage as float between 0 to 1 if (pcAln < minPcAln) { if (showStats) fprintf(stdout, "Rejected %d, alignment pc: %.4f\n", line, pcAln); continue; } if ((outResult = bam_write1(out, b)) < -1) { fprintf(stderr, "ERROR: Attempt to write read no. %d to file \"%s\" failed with code %d.\n", line, outBamFile, outResult); } } if (result < -1) { fprintf(stderr, "ERROR: retrieval of read no. %d from file \"%s\" failed with code %d.\n", line, inBamFile, result); } } if (in) bgzf_close(in); if (out) bgzf_close(out); bam_destroy1(b); }
/** * Gets the base in the read that is mapped to a genomic position. * Extracts the read sequence and aualities too. */ void bam_get_base_and_qual_and_read_and_qual(bam1_t *srec, uint32_t pos, char& base, char& qual, int32_t& rpos, kstring_t* readseq, kstring_t* readqual) { bam1_core_t *c = &srec->core; int32_t rlen = c->l_qseq; uint32_t cpos = c->pos; //reference coordinates of the first mapped base rpos = 0; //read coordinates kstring_t str; str.l = str.m = 0, str.s = 0; base = 'N'; qual = 0; if (c->n_cigar) { uint32_t *cigar = bam_get_cigar(srec); for (uint32_t i = 0; i < c->n_cigar; ++i) { char op = bam_cigar_opchr(cigar[i]); str.l = 0; kputw(bam_cigar_oplen(cigar[i]), &str); char* stop; uint32_t len = strtol(str.s, &stop, 10); assert(stop); if (op=='M') { if (pos>=cpos && pos<=cpos+len-1) { rpos += pos-cpos; break; } cpos += len; rpos += len; } else if (op=='D') { if (pos>=cpos && pos<=cpos+len-1) { rpos = -1; break; } cpos += len; } else if (op=='S' || op=='I') { rpos += len; } } //std::cout << "bpos " << bpos << "\n"; if (rpos>=0 && rpos<=rlen) { //sequence bam_get_seq_string(srec, readseq); base = readseq->s[rpos]; //qual bam_get_qual_string(srec, readqual); qual = readqual->s[rpos]; } else { rpos = BAM_READ_INDEX_NA; } } // std::cout << "b: " << base << "\n"; // std::cout << "q: " << s[bpos-1] << " " << q << "\n"; // for (uint32_t i = 0; i < c->l_qseq; ++i) std::cerr << ((char)(s[i] + 33)); };