/*
 * Builds the "ct" (template CIGAR) tag for two records that are assumed to
 * belong to the same template and stores it on the record with the smaller
 * leftmost coordinate.
 *
 * Nothing is written (and str is left empty) when the records are on
 * different references, have no reference or coordinate, or either one is
 * flagged unmapped.  Any "ct" tag already present on either record is removed
 * before the new one is appended.  str is scratch space and is overwritten.
 */
static void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str)
{
    bam1_t *rec[2];
    uint8_t *old_tag;
    int which;

    str->l = 0;

    /* coordinateless or not on the same chr; skip */
    if (b1->core.tid != b2->core.tid || b1->core.tid < 0)
        return;
    if (b1->core.pos < 0 || b2->core.pos < 0)
        return;
    if ((b1->core.flag & BAM_FUNMAP) || (b2->core.flag & BAM_FUNMAP))
        return;

    /* rec[0] is always the record with the smaller coordinate */
    if (b1->core.pos > b2->core.pos) {
        rec[0] = b2;
        rec[1] = b1;
    } else {
        rec[0] = b1;
        rec[1] = b2;
    }

    for (which = 0; which < 2; ++which) {
        bam1_t *cur = rec[which];
        uint32_t *ops = bam_get_cigar(cur);
        uint32_t k;

        kputc((cur->core.flag & BAM_FREAD1) ? '1' : '2', str);   /* segment index */
        kputc((cur->core.flag & BAM_FREVERSE) ? 'R' : 'F', str); /* strand */
        for (k = 0; k < cur->core.n_cigar; ++k) {
            kputw(bam_cigar_oplen(ops[k]), str);
            kputc(bam_cigar_opchr(ops[k]), str);
        }

        if (which == 0) {
            /* distance between the end of the first record and the start of the second */
            int end = bam_endpos(rec[0]);
            kputw(rec[1]->core.pos - end, str);
            kputc('T', str);
        }
    }

    /* drop stale tags from both records, then tag the leftmost one */
    old_tag = bam_aux_get(rec[0], "ct");
    if (old_tag != NULL)
        bam_aux_del(rec[0], old_tag);
    old_tag = bam_aux_get(rec[1], "ct");
    if (old_tag != NULL)
        bam_aux_del(rec[1], old_tag);

    bam_aux_append(rec[0], "ct", 'Z', str->l + 1, (uint8_t *)str->s);
}
/*
 * Variant written against the bam1_cigar()/bam_calend() accessors: composes a
 * template CIGAR for two records and appends it as a "CT" string tag to the
 * record with the smaller coordinate.
 *
 * The function returns without writing anything when the two records are on
 * different references or have no reference.  str is scratch space and is
 * overwritten.
 */
void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str)
{
    bam1_t *left = b1;
    bam1_t *right = b2;
    uint32_t *ops;
    int k, left_end;

    str->l = 0;
    if (b1->core.tid < 0 || b1->core.tid != b2->core.tid)
        return; /* coordinateless or not on the same chr; skip */

    if (b1->core.pos > b2->core.pos) { /* make sure `left` has the smaller coordinate */
        left = b2;
        right = b1;
    }

    /* leftmost record: segment index, strand, CIGAR */
    kputc((left->core.flag & BAM_FREAD1) ? '1' : '2', str);
    kputc((left->core.flag & BAM_FREVERSE) ? 'R' : 'F', str);
    ops = bam1_cigar(left);
    k = 0;
    while (k < left->core.n_cigar) {
        kputw(bam_cigar_oplen(ops[k]), str);
        kputc(bam_cigar_opchr(ops[k]), str);
        ++k;
    }

    /* gap between the two records */
    left_end = bam_calend(&left->core, ops);
    kputw(right->core.pos - left_end, str);
    kputc('T', str);

    /* rightmost record: segment index, strand, CIGAR */
    kputc((right->core.flag & BAM_FREAD1) ? '1' : '2', str);
    kputc((right->core.flag & BAM_FREVERSE) ? 'R' : 'F', str);
    ops = bam1_cigar(right);
    k = 0;
    while (k < right->core.n_cigar) {
        kputw(bam_cigar_oplen(ops[k]), str);
        kputc(bam_cigar_opchr(ops[k]), str);
        ++k;
    }

    bam_aux_append(left, "CT", 'Z', str->l + 1, (uint8_t *)str->s);
}
/*
 * Returns the unclipped read length described by a CIGAR: the summed length
 * of every operation that consumes query bases, plus every hard clip.
 *
 * n_cigar  number of packed operations in cigar
 * cigar    packed CIGAR operations
 */
int bam_cigar2ulen(int n_cigar, const uint32_t *cigar)
{
    int total = 0;
    int idx;

    for (idx = 0; idx < n_cigar; idx++) {
        const uint32_t packed = cigar[idx];
        const int counts = (bam_cigar_type(bam_cigar_op(packed)) & 1)
                           || bam_cigar_op(packed) == BAM_CHARD_CLIP;
        if (counts)
            total += bam_cigar_oplen(packed);
    }
    return total;
}
/*
 * Returns the summed length of the CIGAR operations whose type is 3, i.e.
 * those that consume both query and reference.
 *
 * n_cigar  number of packed operations in cigar
 * cigar    packed CIGAR operations
 */
int bam_cigar2matches(int n_cigar, const uint32_t *cigar)
{
    int aligned = 0;
    int idx = 0;

    while (idx < n_cigar) {
        const uint32_t packed = cigar[idx++];
        if (bam_cigar_type(bam_cigar_op(packed)) != 3)
            continue;
        aligned += bam_cigar_oplen(packed);
    }
    return aligned;
}
// Creates a parser positioned on the first operation of a packed CIGAR.
//
// cigar  packed CIGAR operations (not copied; must outlive the parser)
// len    number of operations in cigar
//
// The first operation is only decoded when there is one: with a null pointer
// or a non-positive length the current op and its length are set to 0 instead
// of reading through an invalid pointer.
CigarParser::CigarParser(uint32_t const* cigar, int len)
    : cigar_(cigar)
    , len_(len)
    , readPos_(0)
    , refPos_(0)
    , currentOpIdx_(0)
    , currentOp_((cigar && len > 0) ? bam_cigar_op(*cigar) : 0)
    , currentOpLen_((cigar && len > 0) ? bam_cigar_oplen(*cigar) : 0)
    , started_(false)
{
}
/*
 * Fills the alignment-offset fields of `l` from a packed CIGAR.
 *
 *   raLen        reference bases spanned (M, =, X, D, N)
 *   qaLen        query bases aligned     (M, =, X, I)
 *   sclip/eclip  clipped bases (S or H) before / after the first M/=/X op
 *   rapos        set to sa_pos
 *   pos,SQO,EQO  derived from the values above; the formulas differ between
 *                the '+' strand and any other strand character
 */
void calcAlnOffsets(uint32_t *cigar, uint32_t n_cigar, uint32_t sa_pos, char sa_strand, struct line *l)
{
    bool leading = true; /* still before the first M/=/X operation */
    uint32_t idx;

    l->raLen = 0;
    l->qaLen = 0;
    l->sclip = 0;
    l->eclip = 0;
    l->SQO = 0;
    l->EQO = 0;

    for (idx = 0; idx < n_cigar; ++idx) {
        const uint32_t span = bam_cigar_oplen(cigar[idx]);

        switch (bam_cigar_opchr(cigar[idx])) {
        case 'M':
        case '=':
        case 'X':
            l->raLen += span;
            l->qaLen += span;
            leading = false;
            break;
        case 'S':
        case 'H':
            if (leading)
                l->sclip += span;
            else
                l->eclip += span;
            break;
        case 'D':
        case 'N':
            l->raLen += span;
            break;
        case 'I':
            l->qaLen += span;
            break;
        default:
            break;
        }
    }

    l->rapos = sa_pos;

    if (sa_strand == '+') {
        l->pos = l->rapos - l->sclip;
        l->SQO = l->sclip;
        l->EQO = l->sclip + l->qaLen - 1;
    } else {
        l->pos = l->rapos + l->raLen + l->eclip - 1;
        l->SQO = l->eclip;
        l->EQO = l->eclip + l->qaLen - 1;
    }
}
// Adapted from samtools/bam.c int32_t b2g_bam_clippedlength(bam1_t *b) { const bam1_core_t c = b->core; const uint32_t *cigar = bam1_cigar(b); uint32_t k; int32_t l = 0; for (k = 0; k < c.n_cigar; ++k) { if ('S' == bam_cigar_opchr(bam_cigar_op(cigar[k]))) { l += bam_cigar_oplen(cigar[k]); } } return l; }
// Return sum of bases on right of alignment with: // * hard masked (H) // * soft masked (S) // * inserted bases relative to ref (I) static inline uint32_t bam_get_end_padding(int n_cigar, const uint32_t *cigar) { ctx_assert(n_cigar > 0); uint32_t i, l = 0; const uint32_t c = (1<<BAM_CINS)|(1<<BAM_CSOFT_CLIP)|(1<<BAM_CHARD_CLIP); for(i = n_cigar-1; i > 0; i--) if((c >> bam_cigar_op(cigar[i])) & 1) l += bam_cigar_oplen(cigar[i]); return l; }
/* Make a node containing an InDel
 *
 * b             The input read
 * cigar_op_num  The operation number of the Insertion/Deletion
 *
 * The node's start is derived from the read position plus the length of every
 * reference-consuming operation before cigar_op_num. Its end is start plus
 * the longest operation in the run of consecutive I/D operations beginning at
 * cigar_op_num (or start itself if that run is empty).
 *
 * returns a node, that must be either inserted into the linked list or
 *         destroyed with destroyNode(); NULL if memory could not be allocated
 */
InDel *makeNode(bam1_t *b, int cigar_op_num) {
    int i, op, oplen, type;
    int32_t start = b->core.pos-1;
    int32_t end;
    uint32_t *cigar = bam_get_cigar(b);
    InDel *node;

    /* Advance start over everything that consumes the reference */
    for(i=0; i<cigar_op_num; i++) {
        oplen = bam_cigar_oplen(cigar[i]);
        type = bam_cigar_type(bam_cigar_op(cigar[i]));
        if(type & 2) start += oplen;
    }
    end = ++start;

    /* Extend end over the run of adjacent insertions (1) / deletions (2) */
    for(i=cigar_op_num; i<b->core.n_cigar; i++) {
        op = bam_cigar_op(cigar[i]);
        if(op != 1 && op != 2) break;
        oplen = bam_cigar_oplen(cigar[i]);
        end = (end>start+oplen) ? end : start+oplen;
    }

    /* Make the node */
    node = calloc(1, sizeof *node);
    if(!node) return NULL;
    node->tid = b->core.tid;
    node->start = start;
    node->end = end;
    node->count = 1;
    return node;
}
/**
 * Writes the textual CIGAR of a BAM record into cigar_string, replacing its
 * previous contents. The string is left empty when the record has no CIGAR
 * operations.
 */
void bam_get_cigar_string(bam1_t *s, kstring_t *cigar_string)
{
    const int32_t total_ops = bam_get_n_cigar_op(s);

    cigar_string->l = 0;
    if (total_ops == 0)
        return;

    const uint32_t *op = bam_get_cigar(s);
    int32_t remaining = total_ops;
    while (remaining-- > 0) {
        kputw(bam_cigar_oplen(*op), cigar_string);
        kputc(bam_cigar_opchr(*op), cigar_string);
        ++op;
    }
}
void CigarParser::advance() { int type = bam_cigar_type(currentOp_); if (type & BAM_CONSUME_REFERENCE) { refPos_ += currentOpLen_; } if (type & BAM_CONSUME_QUERY) { readPos_ += currentOpLen_; } ++currentOpIdx_; assert(currentOpIdx_ < len_); currentOp_ = bam_cigar_op(cigar_[currentOpIdx_]); currentOpLen_ = bam_cigar_oplen(cigar_[currentOpIdx_]); }
/*
 * Computes the query and reference lengths described by a packed CIGAR.
 *
 * cigar_bin     packed CIGAR operations
 * cigar_bin_sz  number of operations
 * q_len         out: summed length of query-consuming operations
 * r_len         out: summed length of reference-consuming operations
 *
 * Returns the summed length of all operations.
 */
unsigned seq_lens_from_bin_cigar (uint32_t* cigar_bin, unsigned cigar_bin_sz, unsigned* q_len, unsigned* r_len)
{
    unsigned oplen, constype;
    uint32_t *sent;
    unsigned allen = 0;

    *q_len = *r_len = 0;
    for (sent = cigar_bin + cigar_bin_sz; cigar_bin != sent; ++cigar_bin)
    {
        oplen = bam_cigar_oplen (*cigar_bin);
        /* bam_cigar_type expects the operation code, not the packed word */
        constype = bam_cigar_type (bam_cigar_op (*cigar_bin));
        if (constype & CONSUME_QRY) *q_len += oplen;
        if (constype & CONSUME_REF) *r_len += oplen;
        allen += oplen;
    }
    return allen;
}
// Returns 0 on success, -1 on failure. static int bam_format_cigar(const bam1_t* b, kstring_t* str) { // An empty cigar is a special case return "*" rather than "" if (b->core.n_cigar == 0) { return (kputc('*', str) == EOF) ? -1 : 0; } const uint32_t *cigar = bam_get_cigar(b); uint32_t i; for (i = 0; i < b->core.n_cigar; ++i) { if (kputw(bam_cigar_oplen(cigar[i]), str) == EOF) return -1; if (kputc(bam_cigar_opchr(cigar[i]), str) == EOF) return -1; } return 0; }
/*
 * Returns the record position with the leading run of soft/hard clips
 * subtracted, plus one.
 */
static int32_t unclipped_start(bam1_t *b)
{
    const uint32_t *ops = bam_get_cigar(b);
    int32_t leading = 0;
    uint32_t k = 0;

    while (k < b->core.n_cigar) {
        const char code = bam_cigar_opchr(ops[k]);
        if (code != 'S' && code != 'H')
            break; /* first non-clip operation ends the run */
        leading += bam_cigar_oplen(ops[k]);
        ++k;
    }
    return b->core.pos - leading + 1;
}
// Returns 0 to indicate read should be output 1 otherwise static int process_aln(const bam_hdr_t *h, bam1_t *b, samview_settings_t* settings) { if (settings->remove_B) bam_remove_B(b); if (settings->min_qlen > 0) { int k, qlen = 0; uint32_t *cigar = bam_get_cigar(b); for (k = 0; k < b->core.n_cigar; ++k) if ((bam_cigar_type(bam_cigar_op(cigar[k]))&1) || bam_cigar_op(cigar[k]) == BAM_CHARD_CLIP) qlen += bam_cigar_oplen(cigar[k]); if (qlen < settings->min_qlen) return 1; } if (b->core.qual < settings->min_mapQ || ((b->core.flag & settings->flag_on) != settings->flag_on) || (b->core.flag & settings->flag_off)) return 1; if (settings->bed && (b->core.tid < 0 || !bed_overlap(settings->bed, h->target_name[b->core.tid], b->core.pos, bam_endpos(b)))) return 1; if (settings->subsam_frac > 0.) { uint32_t k = __ac_Wang_hash(__ac_X31_hash_string(bam_get_qname(b)) ^ settings->subsam_seed); if ((double)(k&0xffffff) / 0x1000000 >= settings->subsam_frac) return 1; } if (settings->rghash) { uint8_t *s = bam_aux_get(b, "RG"); if (s) { khint_t k = kh_get(rg, settings->rghash, (char*)(s + 1)); if (k == kh_end(settings->rghash)) return 1; } } if (settings->library) { const char *p = bam_get_library((bam_hdr_t*)h, b); if (!p || strcmp(p, settings->library) != 0) return 1; } if (settings->remove_aux_len) { size_t i; for (i = 0; i < settings->remove_aux_len; ++i) { uint8_t *s = bam_aux_get(b, settings->remove_aux[i]); if (s) { bam_aux_del(b, s); } } } return 0; }
/*
 * Expands the aligned part of a read into `s`, one byte per padded reference
 * column: the 4-bit base code for every M column and 0 for every D column.
 * Soft-clipped bases are skipped. Only M, D and S operations are accepted
 * (asserted). On return s->l is the number of columns written.
 */
static void unpad_seq(bam1_t *b, kstring_t *s)
{
    int k, j, i;
    int length = 0;
    uint32_t *cigar = bam1_cigar(b);
    uint8_t *seq = bam1_seq(b);

    /* Output has one entry per M or D base, which can exceed l_qseq when
       deletions are present, so size the buffer from the CIGAR. */
    for (k = 0; k < b->core.n_cigar; ++k) {
        int op = bam_cigar_op(cigar[k]);
        if (op == BAM_CMATCH || op == BAM_CDEL)
            length += bam_cigar_oplen(cigar[k]);
    }
    ks_resize(s, length);

    for (k = 0, s->l = 0, j = 0; k < b->core.n_cigar; ++k) {
        int op, ol;
        op = bam_cigar_op(cigar[k]);
        ol = bam_cigar_oplen(cigar[k]);
        assert(op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CSOFT_CLIP);
        if (op == BAM_CMATCH) {
            /* j must advance with every copied base, not once per operation */
            for (i = 0; i < ol; ++i, ++j)
                s->s[s->l++] = bam1_seqi(seq, j);
        } else if (op == BAM_CSOFT_CLIP) {
            j += ol;
        } else {
            for (i = 0; i < ol; ++i)
                s->s[s->l++] = 0;
        }
    }
}
/*
 * Expands the aligned part of a read into `s`, one byte per padded reference
 * column: the 4-bit base code for M/=/X columns and 0 for D (and N) columns.
 * Soft clips are skipped and hard clips ignored; any other operation is an
 * error.
 *
 * Returns 0 on success, -1 on an error (unexpected CIGAR operation, or the
 * number of columns written differs from the length implied by the CIGAR).
 */
static int unpad_seq(bam1_t *b, kstring_t *s)
{
    int k, j, i;
    int length;
    int cigar_n_warning = 0; /* Make this a global and limit to one CIGAR N warning? */
    uint32_t *cigar = bam_get_cigar(b);
    uint8_t *seq = bam_get_seq(b);

    // b->core.l_qseq gives length of the SEQ entry (including soft clips, S)
    // We need the padded length after alignment from the CIGAR (excluding
    // soft clips S, but including pads from CIGAR D operations)
    length = bam_cigar2rlen(b->core.n_cigar, cigar);
    ks_resize(s, length);
    for (k = 0, s->l = 0, j = 0; k < b->core.n_cigar; ++k) {
        int op, ol;
        op = bam_cigar_op(cigar[k]);
        ol = bam_cigar_oplen(cigar[k]);
        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
            for (i = 0; i < ol; ++i, ++j) s->s[s->l++] = bam_seqi(seq, j);
        } else if (op == BAM_CSOFT_CLIP) {
            j += ol;
        } else if (op == BAM_CHARD_CLIP) {
            /* do nothing */
        } else if (op == BAM_CDEL) {
            for (i = 0; i < ol; ++i) s->s[s->l++] = 0;
        } else if (op == BAM_CREF_SKIP) {
            /* Treat CIGAR N as D (not ideal, but better than ignoring it) */
            for (i = 0; i < ol; ++i) s->s[s->l++] = 0;
            if (0 == cigar_n_warning) {
                cigar_n_warning = -1;
                fprintf(stderr, "[depad] WARNING: CIGAR op N treated as op D in read %s\n", bam_get_qname(b));
            }
        } else {
            fprintf(stderr, "[depad] ERROR: Didn't expect CIGAR op %c in read %s\n", BAM_CIGAR_STR[op], bam_get_qname(b));
            return -1;
        }
    }
    /* Report a length mismatch with the documented error code */
    return ((size_t)length == s->l) ? 0 : -1;
}
unsigned alignment_bounds_from_bin_cigar (uint32_t* cigar_bin, unsigned cigar_bin_sz, uint8_t forward, unsigned qry_len, unsigned* q_beg, unsigned* q_end, unsigned* r_beg, unsigned* r_end) { unsigned oplen, op, constype; uint32_t *sent; *q_beg = *q_end = *r_beg = *r_end = 0; unsigned allen = 0; uint32_t tail = 0; for (sent = cigar_bin + cigar_bin_sz; cigar_bin != sent; ++cigar_bin) { oplen = bam_cigar_oplen (*cigar_bin); op = bam_cigar_op (*cigar_bin); constype = bam_cigar_type (*cigar_bin); if (tail && (op == BAM_CHARD_CLIP || op == BAM_CSOFT_CLIP)) // the aligned zone ended, clip started. Note that tail indels are not valid, so we do not assume they are possible.. break; if (op != BAM_CHARD_CLIP && op != BAM_CSOFT_CLIP) tail = 1; if (constype & CONSUME_QRY) { if (!tail) *q_beg += oplen; *q_end += oplen; } if (constype & CONSUME_REF) { if (!tail) *r_beg += oplen; *r_end += oplen; } allen += oplen; } if (!forward) { unsigned tmp = qry_len - *q_beg; *q_beg = qry_len - *q_end; *q_end = tmp; } return allen; }
/*
 * Returns bam_endpos(b) extended by the trailing run of soft/hard clips.
 * If the CIGAR consists only of clips the whole CIGAR is consumed and the
 * result is not meaningful.
 */
static int32_t unclipped_end(bam1_t *b)
{
    const uint32_t *ops = bam_get_cigar(b);
    const int32_t end_pos = bam_endpos(b);
    int32_t trailing = 0;
    int32_t idx = b->core.n_cigar;

    /* walk backwards from the last operation while it is a clip */
    while (--idx >= 0) {
        const char code = bam_cigar_opchr(ops[idx]);
        if (code != 'S' && code != 'H')
            break;
        trailing += bam_cigar_oplen(ops[idx]);
    }
    return end_pos + trailing;
}
void add_bam_record_to_wiggle(const bam1_t *b, Wiggle& wiggle) { double w; if (no_fractional_weight) w = 1.0; else { uint8_t *p_tag = bam_aux_get(b, "ZW"); if (p_tag == NULL) return; w = bam_aux2f(p_tag); } int pos = b->core.pos; uint32_t *p = bam_get_cigar(b); for (int i = 0; i < (int)b->core.n_cigar; ++i, ++p) { char op = bam_cigar_op(*p); int op_len = bam_cigar_oplen(*p); if (op == BAM_CMATCH) for (int j = 0; j < op_len; ++j, ++pos) wiggle.read_depth[pos] += w; else pos += ((bam_cigar_type(op) & 2) ? op_len : 0); } }
void cigar_print (FILE* f, uint32_t* cigar, unsigned cigar_sz) { uint32_t* sent; for (sent = cigar+cigar_sz; cigar != sent; ++cigar) { uint32_t curop = bam_cigar_op (*cigar); uint32_t count = bam_cigar_oplen (*cigar); char schar; switch (curop) { case BAM_CHARD_CLIP: schar = 'H'; break; case BAM_CSOFT_CLIP: // skip schar = 'S'; break; case BAM_CMATCH: schar = 'M'; break; case BAM_CEQUAL: schar = '='; break; case BAM_CDIFF: schar = '#'; break; case BAM_CINS: schar = 'I'; break; case BAM_CDEL: schar = 'I'; break; default: schar = '?'; } fprintf (f, "%d%c", count, schar); } }
// Returns the record's CIGAR as text (<length><op-char> pairs); the result is
// an empty string when the record has no CIGAR operations.
string bam_cigarString (bam1_t *b) {
    string cigarString("");
    const bam1_core_t *core = &b->core;

    if (core->n_cigar > 0) {
        const uint32_t *ops = bam_get_cigar(b);
        for (int k = 0; k < core->n_cigar; ++k) {
            cigarString += to_string((uint)bam_cigar_oplen(ops[k]));
            cigarString += bam_cigar_opchr(ops[k]);
        }
    }
    return cigarString;
};
// prepare internal structures for clipping and alignment
// returns true if realignment was performed
//
// Inputs:
//   q_seq, q_len   query bases and the query length (q_len is only forwarded to
//                  addClippedBasesToTags)
//   r_seq, r_len   reference bases; NOTE(review): r_len is not read anywhere in
//                  this function
//   r_pos          current alignment position, returned unchanged in new_pos
//                  unless updateReadPosition is called
//   forward        passed on to SetClipping and SetSequences
//   cigar, cigar_sz  packed input CIGAR; cigar_sz must be non-zero (asserted)
// Outputs:
//   cigar_dest, cigar_dest_sz  newly allocated (tmap_malloc) packed CIGAR; only
//                  written when the function returns true. The previous value of
//                  cigar_dest is not freed here.
//   new_pos        only written when the function returns true
//   already_perfect, alignment_failed, unclip_failed  reset to false on entry and
//                  set to true on the corresponding early return
//   clip_failed    NOTE(review): not reset here; it is only handed to
//                  ClipAnchors, which presumably sets it - confirm
bool RealignImp::compute_alignment
(
    const char* q_seq,
    unsigned q_len,
    const char* r_seq,
    unsigned r_len,
    int r_pos,
    bool forward,
    const uint32_t* cigar,
    unsigned cigar_sz,
    uint32_t*& cigar_dest,
    unsigned& cigar_dest_sz,
    int& new_pos,
    bool& already_perfect,
    bool& clip_failed,
    bool& alignment_failed,
    bool& unclip_failed)
{
    already_perfect = false;
    alignment_failed = false;
    unclip_failed = false;

    unsigned oplen;
    // views of the query / CIGAR with the soft-clipped ends removed
    const char* q_seq_clipped = q_seq;
    const uint32_t* cigar_clipped = cigar;
    unsigned cigar_sz_clipped = cigar_sz;
    unsigned sclip_q_len, sclip_r_len, sclip_al_len;

    assert (cigar_sz);
    // reset realigner
    Reset ();

    // set clipping
    SetClipping ((int) cliptype_, forward);

    // clip out the soft clipping zones from 5' and 3' ends
    // (NOTE(review): only BAM_CSOFT_CLIP is tested below; hard clips are not
    // stripped by this code)
    // The 'cut out' of the q_seq is done by switching to downstream pointer.
    if (bam_cigar_op (*cigar) == BAM_CSOFT_CLIP)
    {
        oplen = bam_cigar_oplen (*cigar);
        ClipStart (oplen);
        q_seq_clipped += oplen;
        ++cigar_clipped;
        --cigar_sz_clipped;
    }

    // the trailing clip is only considered when it is not also the first op
    if (cigar_sz > 1 && bam_cigar_op (cigar [cigar_sz - 1]) == BAM_CSOFT_CLIP)
    {
        oplen = bam_cigar_oplen (cigar [cigar_sz - 1]);
        ClipEnd (oplen);
        --cigar_sz_clipped;
    }

    // cigar defines q_seq and t_seq lengths
    sclip_al_len = seq_lens_from_bin_cigar (cigar_clipped, cigar_sz_clipped, &sclip_q_len, &sclip_r_len);

    const std::string query (q_seq_clipped, sclip_q_len);
    const std::string target (r_seq, sclip_r_len);
    std::string pretty_al;
    pretty_al.reserve (sclip_al_len);

    pretty_al_from_bin_cigar (cigar_clipped, cigar_sz_clipped, q_seq_clipped, r_seq, pretty_al);

    // Realigner requires strings of proper size to be passed to SetSequences
    SetSequences (query, target, pretty_al, forward);

    if (!ClipAnchors (clip_failed))
    {
        already_perfect = true;
        return false; // alignment already good, no imperfect zone to realign found
    }

    // TODO avoid automatic vectors to prevent unneeded heap usage
    vector<MDelement> new_md_vec;
    vector<CigarOp> new_cigar_vec;
    unsigned int start_pos_shift;

    if (!computeSWalignment(new_cigar_vec, new_md_vec, start_pos_shift))
    {
        alignment_failed = true;
        return false;
    }

    if (!addClippedBasesToTags(new_cigar_vec, new_md_vec, q_len))
    {
        unclip_failed = true;
        return false; // error adding back clipped out zones
    }

    // the position is recomputed only when the left anchor was not clipped and
    // the realignment reported a non-zero start shift
    if (!LeftAnchorClipped () && start_pos_shift != 0)
    {
        // build cigar data only if it is needed
        // TODO avoid automatic vectors to prevent unneeded heap usage
        std::vector <CigarOp> cigar_vec;
        cigar_vector_from_bin (cigar, cigar_sz, cigar_vec);
        new_pos = updateReadPosition (cigar_vec, start_pos_shift, r_pos);
    }
    else
        new_pos = r_pos;

    // free (cigar_dest);
    // TODO: switch to better alignment memory management, avoid heap operations
    cigar_dest = (uint32_t*) tmap_malloc (sizeof (uint32_t) * new_cigar_vec.size (), "cigar_dest");
    cigar_dest_sz = new_cigar_vec.size ();
    cigar_vector_to_bin (new_cigar_vec, cigar_dest);
    return true;
}
/*
 * Aligns one numerically-encoded read against one target sequence.
 *
 * target     target sequence; encoded with conf->table before alignment
 * read_len   number of bases in read_num
 * read_num   numerically-encoded read
 * qry        ksw query profile, passed through to ksw_align()
 * conf       scoring parameters, encoding table and band width
 * min_score  local alignments scoring below this are not globally realigned
 *
 * Returns an aln_t whose `loc` holds the ksw_align() result and whose
 * target_name points at target->name.s (not copied). When the score is below
 * min_score, cigar is NULL and n_cigar and nm are 0. Otherwise cigar/n_cigar
 * come from ksw_global() (the caller owns the cigar) and nm is the number of
 * mismatching bases plus the length of every operation that does not consume
 * both query and reference.
 *
 * NOTE (carried over from a check the original author had disabled):
 * ksw_align()/ksw_global() were observed to occasionally produce CIGARs whose
 * length is not equal to the read length, mostly for very poor matches.
 * This function does not filter such alignments.
 */
static aln_t align_read_against_one(kseq_t *target, const int read_len, uint8_t *read_num, kswq_t **qry, const align_config_t *conf, const int min_score)
{
    uint8_t *ref_num = calloc(target->seq.l, sizeof(uint8_t));
    for (size_t k = 0; k < target->seq.l; ++k)
        ref_num[k] = conf->table[(int)target->seq.s[k]];

    aln_t aln;
    aln.cigar = NULL;
    aln.n_cigar = 0; /* keep the early-return result fully defined */
    aln.nm = 0;

    aln.loc = ksw_align(read_len, read_num, target->seq.l, ref_num, conf->m, conf->mat, conf->gap_o, conf->gap_e, KSW_XSTART, qry);
    aln.target_name = target->name.s;

    if (aln.loc.score < min_score) {
        free(ref_num);
        return aln;
    }

    ksw_global(aln.loc.qe - aln.loc.qb + 1, &read_num[aln.loc.qb], aln.loc.te - aln.loc.tb + 1, &ref_num[aln.loc.tb], conf->m, conf->mat, conf->gap_o, conf->gap_e, conf->bandwidth, &aln.n_cigar, &aln.cigar);

    aln.nm = 0;
    size_t qi = aln.loc.qb, ri = aln.loc.tb;
    for (int k = 0; k < aln.n_cigar; k++) {
        /* bam_cigar_type expects the operation code, not the packed word */
        const int32_t oplen = bam_cigar_oplen(aln.cigar[k]),
                      optype = bam_cigar_type(bam_cigar_op(aln.cigar[k]));

        if ((optype & 3) == 3) { // consumes both - check for mismatches
            for (int j = 0; j < oplen; j++) {
                if (UNLIKELY(read_num[qi + j] != ref_num[ri + j]))
                    aln.nm++;
            }
        } else {
            aln.nm += oplen;
        }
        if (optype & 1) qi += oplen;
        if (optype & 2) ri += oplen;
    }

    free(ref_num);
    return aln;
}
static aln_v align_read(const kseq_t *read, const kseq_v targets, const align_config_t *conf) { kseq_t *r; const int32_t read_len = read->seq.l; aln_v result; kv_init(result); kv_resize(aln_t, result, kv_size(targets)); uint8_t *read_num = calloc(read_len, sizeof(uint8_t)); for(size_t k = 0; k < read_len; ++k) read_num[k] = conf->table[(int)read->seq.s[k]]; // Align to each target kswq_t *qry = NULL; for(size_t j = 0; j < kv_size(targets); j++) { // Encode target r = &kv_A(targets, j); uint8_t *ref_num = calloc(r->seq.l, sizeof(uint8_t)); for(size_t k = 0; k < r->seq.l; ++k) ref_num[k] = conf->table[(int)r->seq.s[k]]; aln_t aln; aln.target_idx = j; aln.loc = ksw_align(read_len, read_num, r->seq.l, ref_num, conf->m, conf->mat, conf->gap_o, conf->gap_e, KSW_XSTART, &qry); ksw_global(aln.loc.qe - aln.loc.qb + 1, &read_num[aln.loc.qb], aln.loc.te - aln.loc.tb + 1, &ref_num[aln.loc.tb], conf->m, conf->mat, conf->gap_o, conf->gap_e, 50, /* TODO: Magic number - band width */ &aln.n_cigar, &aln.cigar); aln.nm = 0; size_t qi = aln.loc.qb, ri = aln.loc.tb; for(size_t k = 0; k < aln.n_cigar; k++) { const int32_t oplen = bam_cigar_oplen(aln.cigar[k]), optype = bam_cigar_type(aln.cigar[k]); if(optype & 3) { // consumes both - check for mismatches for(size_t j = 0; j < oplen; j++) { if(UNLIKELY(read_num[qi + j] != ref_num[ri + j])) aln.nm++; } } else { aln.nm += oplen; } if(optype & 1) qi += oplen; if(optype & 2) ri += oplen; } kv_push(aln_t, result, aln); free(ref_num); } free(qry); free(read_num); ks_introsort(dec_score, kv_size(result), result.a); return result; }
/*
 * Converts padded alignments read from `in` into unpadded alignments written
 * to `out`.
 *
 * in, out  input and output streams
 * h        header shared by both streams
 * fai      optional FASTA index used to load (padded) references that are
 *          not embedded in the input; may be NULL
 *
 * A record at position 0 whose name equals its reference name is treated as
 * an embedded reference: it defines the padded reference, is checked against
 * the header length (and the FASTA, if given) and is rewritten with a single
 * M operation. Every other record with a CIGAR gets a new CIGAR derived by
 * comparing its padded sequence with the padded reference, and positions
 * (including mate positions) are translated through the position map.
 * Unmapped records are written unchanged.
 *
 * Returns 0 on success, 1 if the input was truncated, -1 on any other error.
 * All working buffers are released on every exit path.
 */
int bam_pad2unpad(samFile *in, samFile *out, bam_hdr_t *h, faidx_t *fai)
{
    bam1_t *b = 0;
    kstring_t r, q;             /* padded reference / padded read, one byte per column */
    int r_tid = -1;             /* reference currently held in r, -1 if none */
    uint32_t *cigar2 = 0;       /* CIGAR under construction (grown by write_cigar) */
    int ret = 0, n2 = 0, m2 = 0, *posmap = 0;

    b = bam_init1();
    r.l = r.m = q.l = q.m = 0; r.s = q.s = 0;
    int read_ret;
    while ((read_ret = sam_read1(in, h, b)) >= 0) { // read one alignment from `in'
        // Cannot depad unmapped CRAM data
        if (b->core.flag & BAM_FUNMAP)
            goto next_seq;

        uint32_t *cigar = bam_get_cigar(b);
        n2 = 0;
        if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam_get_qname(b), h->target_name[b->core.tid]) == 0) {
            // fprintf(stderr, "[depad] Found embedded reference '%s'\n", bam_get_qname(b));
            r_tid = b->core.tid;
            if (0!=unpad_seq(b, &r)) {
                fprintf(stderr, "[depad] ERROR: Problem parsing SEQ and/or CIGAR in reference %s\n", bam_get_qname(b));
                ret = -1;
                goto depad_end;
            };
            if (h->target_len[r_tid] != r.l) {
                fprintf(stderr, "[depad] ERROR: (Padded) length of '%s' is %u in BAM header, but %llu in embedded reference\n", bam_get_qname(b), h->target_len[r_tid], (unsigned long long)(r.l));
                ret = -1;
                goto depad_end;
            }
            if (fai) {
                // Check the embedded reference matches the FASTA file
                if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &q)) {
                    fprintf(stderr, "[depad] ERROR: Failed to load embedded reference '%s' from FASTA\n", h->target_name[b->core.tid]);
                    ret = -1;
                    goto depad_end;
                }
                assert(r.l == q.l);
                int i;
                for (i = 0; i < r.l; ++i) {
                    if (r.s[i] != q.s[i]) {
                        // Show gaps as ASCII 45
                        fprintf(stderr, "[depad] ERROR: Embedded sequence and reference FASTA don't match for %s base %i, '%c' vs '%c'\n",
                            h->target_name[b->core.tid], i+1,
                            r.s[i] ? seq_nt16_str[(int)r.s[i]] : 45,
                            q.s[i] ? seq_nt16_str[(int)q.s[i]] : 45);
                        ret = -1;
                        goto depad_end;
                    }
                }
            }
            write_cigar(cigar2, n2, m2, bam_cigar_gen(b->core.l_qseq, BAM_CMATCH));
            replace_cigar(b, n2, cigar2);
            posmap = update_posmap(posmap, r);
        } else if (b->core.n_cigar > 0) {
            int i, k, op;
            if (b->core.tid < 0) {
                fprintf(stderr, "[depad] ERROR: Read '%s' has CIGAR but no RNAME\n", bam_get_qname(b));
                ret = -1;
                goto depad_end;
            } else if (b->core.tid == r_tid) {
                ; // good case, reference available
                //fprintf(stderr, "[depad] Have ref '%s' for read '%s'\n", h->target_name[b->core.tid], bam_get_qname(b));
            } else if (fai) {
                if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) {
                    fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]);
                    ret = -1;
                    goto depad_end;
                }
                posmap = update_posmap(posmap, r);
                r_tid = b->core.tid;
                // fprintf(stderr, "[depad] Loaded %s from FASTA file\n", h->target_name[b->core.tid]);
            } else {
                fprintf(stderr, "[depad] ERROR: Missing %s embedded reference sequence (and no FASTA file)\n", h->target_name[b->core.tid]);
                ret = -1;
                goto depad_end;
            }
            if (0!=unpad_seq(b, &q)) {
                fprintf(stderr, "[depad] ERROR: Problem parsing SEQ and/or CIGAR in read %s\n", bam_get_qname(b));
                ret = -1;
                goto depad_end;
            };
            /* Carry leading clips over unchanged */
            if (bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) {
                write_cigar(cigar2, n2, m2, cigar[0]);
            } else if (bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP) {
                write_cigar(cigar2, n2, m2, cigar[0]);
                if (b->core.n_cigar > 2 && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP) {
                    write_cigar(cigar2, n2, m2, cigar[1]);
                }
            }
            /* Determine CIGAR operator for each base in the aligned read */
            for (i = 0, k = b->core.pos; i < q.l; ++i, ++k)
                q.s[i] = q.s[i]? (r.s[k]? BAM_CMATCH : BAM_CINS) : (r.s[k]? BAM_CDEL : BAM_CPAD);
            /* Include any pads if starts with an insert */
            if (q.s[0] == BAM_CINS) {
                for (k = 0; k+1 < b->core.pos && !r.s[b->core.pos - k - 1]; ++k);
                if (k) write_cigar(cigar2, n2, m2, bam_cigar_gen(k, BAM_CPAD));
                k = 0;
            } else if (q.s[0] == BAM_CPAD) {
                // Join 'k' CPAD to our first cigar op CPAD too.
                for (k = 0; k+1 < b->core.pos && !r.s[b->core.pos - k - 1]; ++k);
            } else {
                k = 0;
            }
            /* Count consecutive CIGAR operators to turn into a CIGAR string */
            for (i = 1, k++, op = q.s[0]; i < q.l; ++i) {
                if (op != q.s[i]) {
                    write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
                    op = q.s[i]; k = 1;
                } else ++k;
            }
            write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
            /* Carry trailing clips over unchanged */
            if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CSOFT_CLIP) {
                write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]);
            } else if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CHARD_CLIP) {
                if (b->core.n_cigar > 2 && bam_cigar_op(cigar[b->core.n_cigar-2]) == BAM_CSOFT_CLIP) {
                    write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-2]);
                }
                write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]);
            }
            /* Remove redundant P operators between M/X/=/D operators, e.g. 5M2P10M -> 15M */
            int pre_op, post_op;
            for (i = 2; i < n2; ++i)
                if (bam_cigar_op(cigar2[i-1]) == BAM_CPAD) {
                    pre_op = bam_cigar_op(cigar2[i-2]);
                    post_op = bam_cigar_op(cigar2[i]);
                    /* Note don't need to check for X/= as code above will use M only */
                    if ((pre_op == BAM_CMATCH || pre_op == BAM_CDEL) && (post_op == BAM_CMATCH || post_op == BAM_CDEL)) {
                        /* This is a redundant P operator */
                        cigar2[i-1] = 0; // i.e. 0M
                        /* If had same operator either side, combine them in post_op */
                        if (pre_op == post_op) {
                            /* If CIGAR M, could treat as simple integers since BAM_CMATCH is zero*/
                            cigar2[i] = bam_cigar_gen(bam_cigar_oplen(cigar2[i-2]) + bam_cigar_oplen(cigar2[i]), post_op);
                            cigar2[i-2] = 0; // i.e. 0M
                        }
                    }
                }
            /* Remove the zero'd operators (0M) */
            for (i = k = 0; i < n2; ++i)
                if (cigar2[i]) cigar2[k++] = cigar2[i];
            n2 = k;
            replace_cigar(b, n2, cigar2);
        }
        /* Even unmapped reads can have a POS value, e.g. if their mate was mapped */
        if (b->core.pos != -1) b->core.pos = posmap[b->core.pos];
        if (b->core.mtid < 0 || b->core.mpos < 0) {
            /* Nice case, no mate to worry about*/
            // fprintf(stderr, "[depad] Read '%s' mate not mapped\n", bam_get_qname(b));
            /* TODO - Warning if FLAG says mate should be mapped? */
            /* Clean up funny input where mate position is given but mate reference is missing: */
            b->core.mtid = -1;
            b->core.mpos = -1;
        } else if (b->core.mtid == b->core.tid) {
            /* Nice case, same reference */
            // fprintf(stderr, "[depad] Read '%s' mate mapped to same ref\n", bam_get_qname(b));
            b->core.mpos = posmap[b->core.mpos];
        } else {
            /* Nasty case, Must load alternative posmap */
            // fprintf(stderr, "[depad] Loading reference '%s' temporarily\n", h->target_name[b->core.mtid]);
            if (!fai) {
                fprintf(stderr, "[depad] ERROR: Needed reference %s sequence for mate (and no FASTA file)\n", h->target_name[b->core.mtid]);
                ret = -1;
                goto depad_end;
            }
            /* Temporarily load the other reference sequence */
            if (load_unpadded_ref(fai, h->target_name[b->core.mtid], h->target_len[b->core.mtid], &r)) {
                fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.mtid]);
                ret = -1;
                goto depad_end;
            }
            posmap = update_posmap(posmap, r);
            b->core.mpos = posmap[b->core.mpos];
            /* Restore the reference and posmap*/
            if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) {
                fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]);
                ret = -1;
                goto depad_end;
            }
            posmap = update_posmap(posmap, r);
        }
        /* Most reads will have been moved so safest to always recalculate the BIN value */
        b->core.bin = bam_reg2bin(b->core.pos, bam_endpos(b));

    next_seq:
        sam_write1(out, h, b);
    }
    if (read_ret < -1) {
        fprintf(stderr, "[depad] truncated file.\n");
        ret = 1;
    }

depad_end:
    /* Single cleanup point: reached on success and from every error path */
    free(r.s); free(q.s); free(posmap); free(cigar2);
    bam_destroy1(b);
    return ret;
}
/*
 * Saves the first character of the current ZS value in a new "OS" tag and
 * then overwrites that character with new_call.
 *
 * bam_aux_append() may reallocate b->data, which would leave both its source
 * pointer and `bsstrand` dangling, so the old character is copied out and the
 * ZS location is re-derived from its offset after the append.
 */
static void bsstrand_override(bam1_t *b, unsigned char *bsstrand, char new_call)
{
    uint8_t old_call = bsstrand[0];
    long offset = (long)(bsstrand - b->data);

    bam_aux_append(b, "OS", 'A', 1, &old_call);
    b->data[offset] = new_call;
}

/*
 * Per-read callback: counts C>T and G>A differences between the read and the
 * reference over the matched (M) part of the alignment and uses them to
 * check, correct or infer the bisulfite strand stored in the ZS tag.
 *
 * b     the read; may be modified (ZS rewritten, OS/ZS appended)
 * in    input file, used for reference names
 * out   output file, or NULL to write nothing
 * data  bsstrand_data_t* holding configuration, reference cache and counters
 *
 * Unmapped reads are passed through and counted. CIGAR operations other than
 * M, I, D and S abort the program. Always returns 0.
 */
int bsstrand_func(bam1_t *b, const samfile_t *in, samfile_t *out, void *data) {

    bsstrand_data_t *d = (bsstrand_data_t*)data;
    bsstrand_conf_t *conf = d->conf;
    const bam1_core_t *c = &b->core;

    if (c->flag & BAM_FUNMAP){
        if (out) samwrite(out, b);
        d->n_unmapped++;
        return 0;
    }

    fetch_refseq(d->rs, in->header->target_name[c->tid], c->pos, c->pos+1);
    uint32_t rpos=c->pos+1, qpos=0;
    int i, nC2T = 0, nG2A = 0;
    uint32_t j;
    char rbase, qbase;

    for (i=0; i<c->n_cigar; ++i) {
        uint32_t op = bam_cigar_op(bam1_cigar(b)[i]);
        uint32_t oplen = bam_cigar_oplen(bam1_cigar(b)[i]);
        switch(op) {
        case BAM_CMATCH:
            for(j=0; j<oplen; ++j) {
                rbase = toupper(getbase_refseq(d->rs, rpos+j));
                qbase = bscall(bam1_seq(b), qpos+j);
                if (rbase == 'C' && qbase == 'T') nC2T += 1;
                if (rbase == 'G' && qbase == 'A') nG2A += 1;
                /* printf("%c vs %c\n", toupper(rbase), qbase); */
            }
            rpos += oplen;
            qpos += oplen;
            break;
        case BAM_CINS:
            qpos += oplen;
            break;
        case BAM_CDEL:
            rpos += oplen;
            break;
        case BAM_CSOFT_CLIP:
            qpos += oplen;
            break;
        default:
            fprintf(stderr, "Unknown cigar, %u\n", op);
            abort();
        }
    }

    char key[2] = {'Z','S'};
    unsigned char *bsstrand = bam_aux_get(b, key);
    if (bsstrand) {
        bsstrand++; /* skip the type byte; now points at the tag value */
        double s = similarity(nG2A, nC2T);
        if (nG2A > 1 && nC2T > 1 && s > 0.5) {
            /* both conversion types present in similar amounts: ambiguous */
            if (conf->output_read || conf->output_all_read)
                printf("F\t%s\t%d\t%d\t%d\t%s\t%s\t%1.2f\n", in->header->target_name[c->tid], c->pos, nC2T, nG2A, bam1_qname(b), bsstrand, s);
            bsstrand_override(b, bsstrand, '?');
            d->n_fail++;
        } else if (*bsstrand == '+' && nG2A > nC2T + 2) {
            if (conf->output_read || conf->output_all_read)
                printf("W2C\t%s\t%d\t%d\t%d\t%s\t%s\t%1.2f\n", in->header->target_name[c->tid], c->pos, nC2T, nG2A, bam1_qname(b), bsstrand, s);
            bsstrand_override(b, bsstrand, '-');
            d->n_corr++;
        } else if (*bsstrand == '-' && nC2T > nG2A + 2) {
            if (conf->output_read || conf->output_all_read)
                printf("C2W\t%s\t%d\t%d\t%d\t%s\t%s\t%1.2f\n", in->header->target_name[c->tid], c->pos, nC2T, nG2A, bam1_qname(b), bsstrand, s);
            bsstrand_override(b, bsstrand, '+');
            d->n_corr++;
        } else if (conf->output_all_read) {
            printf("N\t%s\t%d\t%d\t%d\t%s\t%s\t%1.2f\n", in->header->target_name[c->tid], c->pos, nC2T, nG2A, bam1_qname(b), bsstrand, s);
        }
    } else if (!(c->flag & BAM_FUNMAP) && conf->infer_bsstrand) {
        /* no ZS tag: infer one from the conversion counts and the read strand */
        char bss[3];
        if (similarity(nG2A, nC2T) < 0.5) {
            strcpy(bss, "??");
        } else if (nC2T > nG2A) {
            strcpy(bss, c->flag & BAM_FREVERSE ? "+-" : "++");
        } else {
            strcpy(bss, c->flag & BAM_FREVERSE ? "-+" : "--");
        }
        bam_aux_append(b, "ZS", 'Z', 3, (uint8_t*) bss);
    }

    if (out) samwrite(out, b);
    d->n_mapped++;

    return 0;
}
/** * Gets the base in the read that is mapped to a genomic position. * Extracts the read sequence and aualities too. */ void bam_get_base_and_qual_and_read_and_qual(bam1_t *srec, uint32_t pos, char& base, char& qual, int32_t& rpos, kstring_t* readseq, kstring_t* readqual) { bam1_core_t *c = &srec->core; int32_t rlen = c->l_qseq; uint32_t cpos = c->pos; //reference coordinates of the first mapped base rpos = 0; //read coordinates kstring_t str; str.l = str.m = 0, str.s = 0; base = 'N'; qual = 0; if (c->n_cigar) { uint32_t *cigar = bam_get_cigar(srec); for (uint32_t i = 0; i < c->n_cigar; ++i) { char op = bam_cigar_opchr(cigar[i]); str.l = 0; kputw(bam_cigar_oplen(cigar[i]), &str); char* stop; uint32_t len = strtol(str.s, &stop, 10); assert(stop); if (op=='M') { if (pos>=cpos && pos<=cpos+len-1) { rpos += pos-cpos; break; } cpos += len; rpos += len; } else if (op=='D') { if (pos>=cpos && pos<=cpos+len-1) { rpos = -1; break; } cpos += len; } else if (op=='S' || op=='I') { rpos += len; } } //std::cout << "bpos " << bpos << "\n"; if (rpos>=0 && rpos<=rlen) { //sequence bam_get_seq_string(srec, readseq); base = readseq->s[rpos]; //qual bam_get_qual_string(srec, readqual); qual = readqual->s[rpos]; } else { rpos = BAM_READ_INDEX_NA; } } // std::cout << "b: " << base << "\n"; // std::cout << "q: " << s[bpos-1] << " " << q << "\n"; // for (uint32_t i = 0; i < c->l_qseq; ++i) std::cerr << ((char)(s[i] + 33)); };
/**
 * Trims leading and trailing N base calls from a record, rebuilding its data
 * buffer inside the scratch buffer owned by the options object and then
 * swapping the two buffers.
 *
 * The first and last CIGAR operations are shortened by the number of trimmed
 * bases (or dropped when they become empty), the packed sequence and the
 * qualities are re-copied, the auxiliary data is restored, "NS"/"NE" tags
 * record how many bases were removed at each end, and the "PV"/"FA" arrays
 * are cut down to the surviving bases.
 *
 * @param b     record to trim; modified in place
 * @param data  pointer to an opts_t (scratch buffer plus skip_all_ns and
 *              min_trimmed_len settings)
 * @return 0 when the record passes; non-zero when it is (nearly) all N and
 *         skip_all_ns is set, or when fewer than min_trimmed_len bases remain.
 *         NOTE(review): presumably a non-zero return tells the caller to drop
 *         the record - confirm against the call site.
 *
 * A read that consists only of Ns (or has no bases) is returned unmodified.
 */
static int trim_ns(bam1_t *b, void *data) {
    int ret = 0;
    opts_t *op((opts_t *)data);
    // Everything located through b->core is captured up front: the htslib
    // accessors recompute their offsets from n_cigar/l_qseq on every call,
    // and n_cigar is edited further down.
    std::vector<uint8_t> aux(bam_get_aux(b), bam_get_aux(b) + bam_get_l_aux(b));
    uint8_t *const seq(bam_get_seq(b));
    uint8_t *const qual(bam_get_qual(b));
    uint32_t *const cigar(bam_get_cigar(b));
    const int l_qseq(b->core.l_qseq);
    int tmp;

    op->resize(b->l_data); // Make sure it's big enough to hold everything.
    memcpy(op->data, b->data, b->core.l_qname);

    // Get #Ns at the beginning (bounded, so an all-N read cannot run off the end)
    for(tmp = 0; tmp < l_qseq && bam_seqi(seq, tmp) == dlib::htseq::HTS_N; ++tmp);
    const int n_start(tmp);

    // At most one non-N base at the very end -- garbage read.
    // (">=" keeps the historical "== l_qseq - 1" case and adds the true all-N case.)
    if(n_start >= l_qseq - 1)
        ret |= op->skip_all_ns;

    if(n_start >= l_qseq) {
        // Nothing would survive trimming; editing the CIGAR below would
        // underflow n_cigar, so leave the record untouched.
        if(0 < op->min_trimmed_len) // Too short.
            ret |= 1;
        return ret;
    }

    // Get #Ns at the end (a non-N base exists at n_start, so this stops there at the latest)
    for(tmp = l_qseq - 1; tmp >= 0 && bam_seqi(seq, tmp) == dlib::htseq::HTS_N; --tmp);
    const int n_end(l_qseq - 1 - tmp);

    // Get new length for read
    int final_len(l_qseq - n_end - n_start);
    if(final_len < 0) final_len = 0;
    if(final_len < op->min_trimmed_len) // Too short.
        ret |= 1;

    // Shorten (or drop) the last cigar operation by the number of trailing Ns.
    if(n_end) {
        if((tmp = bam_cigar_oplen(cigar[b->core.n_cigar - 1]) - n_end) == 0) {
            LOG_DEBUG("Entire cigar operation is the softclip. Decrease the number of new cigar operations.\n");
            --b->core.n_cigar;
        } else {
            LOG_DEBUG("Updating second cigar operation in-place.\n");
            cigar[b->core.n_cigar - 1] = bam_cigar_gen(tmp, BAM_CSOFT_CLIP);
        }
    }

    // Same for the first cigar operation, then copy the cigar into the new buffer.
    if((tmp = bam_cigar_oplen(*cigar) - n_start) == 0) {
        memcpy(op->data + b->core.l_qname, cigar + 1, (--b->core.n_cigar) << 2); // << 2: 4 bytes per cigar op
    } else {
        if(n_start) *cigar = bam_cigar_gen(tmp, BAM_CSOFT_CLIP);
        memcpy(op->data + b->core.l_qname, cigar, b->core.n_cigar << 2);
    }

    // Re-pack the surviving bases, two 4-bit codes per byte.
    uint8_t *opseq(op->data + b->core.l_qname + (b->core.n_cigar << 2)); // Pointer to the seq region of new data field.
    for(tmp = 0; tmp < final_len >> 1; ++tmp)
        opseq[tmp] = (bam_seqi(seq, ((tmp << 1) + n_start)) << 4) | (bam_seqi(seq, (tmp << 1) + n_start + 1));
    if(final_len & 1)
        opseq[tmp] = (bam_seqi(seq, ((tmp << 1) + n_start)) << 4);

    // Copy the surviving qualities. The source is the pointer captured before
    // n_cigar changed; the aux block is restored separately below.
    memcpy(opseq + ((final_len + 1) >> 1), qual + n_start, final_len);

    // Switch data strings
    // NOTE(review): b->m_data is not swapped along with the pointer; the
    // bam_aux_append calls below assume the scratch buffer has room for the
    // new tags - confirm against opts_t::resize.
    std::swap(op->data, b->data);
    b->core.l_qseq = final_len;
    memcpy(bam_get_aux(b), aux.data(), aux.size());
    b->l_data = (bam_get_aux(b) - b->data) + aux.size();

    if(n_end) bam_aux_append(b, "NE", 'i', sizeof(int), (uint8_t *)&n_end);
    if(n_start) bam_aux_append(b, "NS", 'i', sizeof(int), (uint8_t *)&n_start);

    // Per-base arrays are indexed from the other end for reverse-strand reads.
    const uint32_t *pvar((uint32_t *)dlib::array_tag(b, "PV"));
    tmp = b->core.flag & BAM_FREVERSE ? n_end: n_start;
    if(pvar) {
        std::vector<uint32_t>pvals(pvar + tmp, pvar + final_len + tmp);
        bam_aux_del(b, (uint8_t *)(pvar) - 6);
        dlib::bam_aux_array_append(b, "PV", 'I', sizeof(uint32_t), final_len, (uint8_t *)pvals.data());
    }
    const uint32_t *fvar((uint32_t *)dlib::array_tag(b, "FA"));
    if(fvar) {
        std::vector<uint32_t>fvals(fvar + tmp, fvar + final_len + tmp);
        bam_aux_del(b, (uint8_t *)(fvar) - 6);
        dlib::bam_aux_array_append(b, "FA", 'I', sizeof(uint32_t), final_len, (uint8_t *)fvals.data());
    }
    return ret;
}
int do_grep() { #ifdef DEBUGa printf("[!]do_grep\n"); #endif BamInfo_t *pbam; kh_cstr_t BamID; khiter_t ki, bami; kstring_t ks1 = { 0, 0, NULL }; kstring_t ks2 = { 0, 0, NULL }; kstring_t ks3 = { 0, 0, NULL }; samFile *in; bam_hdr_t *h; hts_idx_t *idx; bam1_t *b, *d, *d2, *bR1, *bR2, *bR3; bR1 = bam_init1(); bR2 = bam_init1(); bR3 = bam_init1(); //htsFile *out; //hts_opt *in_opts = NULL, *out_opts = NULL; int r = 0, exit_code = 0; kvec_t(bam1_t) R1, R2, RV; pierCluster_t *pierCluster; //samdat_t tmp_samdat; FILE *fs = fopen("./test.txt","w"); for (bami = kh_begin(bamNFOp); bami != kh_end(bamNFOp); ++bami) { //printf(">[%d]:\n",bami); if (kh_exist(bamNFOp, bami)) { kv_init(R1); kv_init(R2); kv_init(RV); //tmp_samdat = (const samdat_t){ 0 }; //memset(&tmp_samdat,0,sizeof(samdat_t)); //printf("-[%d]:\n",bami); BamID = kh_key(bamNFOp, bami); pbam = &kh_value(bamNFOp, bami); fprintf(stderr, "%u [%s]=%s\t%u %u\n",bami,BamID,pbam->fileName,pbam->insertSize,pbam->SD); in = sam_open(pbam->fileName, "r"); if (in == NULL) { fprintf(stderr, "[x]Error opening \"%s\"\n", pbam->fileName); return EXIT_FAILURE; } h = sam_hdr_read(in); /* out = hts_open("-", "w"); if (out == NULL) { fprintf(stderr, "[x]Error opening standard output\n"); return EXIT_FAILURE; } if (sam_hdr_write(out, h) < 0) { fprintf(stderr, "[!]Error writing output header.\n"); exit_code = 1; } */ int8_t *ChrIsHum; if (h == NULL) { fprintf(stderr, "[x]Couldn't read header for \"%s\"\n", pbam->fileName); return EXIT_FAILURE; } else { ChrIsHum = malloc(h->n_targets * sizeof(int8_t)); for (int32_t i=0; i < h->n_targets; ++i) { //ChrIsHum[i] = -1; ki = kh_get(chrNFO, chrNFOp, h->target_name[i]); if (ki == kh_end(chrNFOp)) { errx(4,"[x]Cannot find ChrID for [%s] !",h->target_name[i]); } else { ChrInfo_t * tmp = &kh_value(chrNFOp, ki); ChrIsHum[i] = tmp->isHum; //printf(">>> %d Chr:%s %d\n",i,h->target_name[i],ChrIsHum[i]); } } } h->ignore_sam_err = 0; b = bam_init1(); d = bam_init1(); d2 = bam_init1(); if ((idx = 
sam_index_load(in, pbam->fileName)) == 0) { fprintf(stderr, "[E::%s] fail to load the BAM index\n", __func__); return 1; } pierCluster = sam_plp_init(); while ((r = sam_read1(in, h, b)) >= 0) { int8_t flag = false; const bam1_core_t *c = &b->core; if (c->flag & BAM_FSECONDARY) continue; if (c->n_cigar) { uint32_t *cigar = bam_get_cigar(b); for (int i = 0; i < c->n_cigar; ++i) { if (bam_cigar_opchr(cigar[i])=='S') { // soft clipping if ( bam_cigar_oplen(cigar[i]) >= myConfig.minGrepSlen ) { flag = true; } } } } if (flag && ChrIsHum[c->tid]) { // Now, skip Virus items. //bam_copy1(bR1, b); flag = 0; // recycle //int enoughMapQ = 0; //kstring_t ks = { 0, 0, NULL }; /*if (sam_format1(h, b, &ks1) < 0) { fprintf(stderr, "Error writing output.\n"); exit_code = 1; break; } else*/ if ((c->mtid == c->tid && ChrIsHum[c->tid]) || (ChrIsHum[c->tid] ^ ChrIsHum[c->mtid])) { // Only grep those mapped on same Human ChrID, or diff species/一方在病毒的情况. //printf(">[%s]\n",ks_str(&ks1)); flag |= 1; //tmp_samdat.b = bam_dup1(b); //kv_push(samdat_t,R1,tmp_samdat); /*if (checkMapQ(ChrIsHum, b, true)) { ++enoughMapQ; }*/ } if (getPairedSam(in, idx, b, d) != 0) { flag &= ~1; continue; } else { flag |= 2; /*if (checkMapQ(ChrIsHum, d, false)) { ++enoughMapQ; }*/ /*if (c->flag & BAM_FSECONDARY) { if (getPairedSam(in, idx, d, d2) == 0) { //sam_format1(h, d2, &ks3); flag |= 4; if (checkMapQ(ChrIsHum, d2, false)) { ++enoughMapQ; } } }*/ } /* 对于 BAM_FSECONDARY(256) 的 Read,跳两次 与 读 SA 项,效果一样。 >[sf95_Ref_48245009_48245108_48245208_Vir_-_2000_2044_R_100_90 353 chr2 13996555 0 50S40M chr18 48245109 0ACACAACAATGTTCCGGAGACTCTAAGGCCTCCCGATACAGAGCAGAGGCCACACACACACACACCATGGAATACTATTCAGCCAAAAAA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC NM:i:0 MD:Z:40 AS:i:40 XS:i:40 RG:Z:Fsimout_mB SA:Z:rgi|59585|emb|X04615.1|,2000,-,40S46M4S,60,0; YC:Z:CT YD:Z:f] -[sf95_Ref_48245009_48245108_48245208_Vir_-_2000_2044_R_100_90 177 chr18 48245109 9 40S50M gi|59585|emb|X04615.1|2000 
0 GTTCCGGAGACTCTAAGGCCTCCCGATACAGAGCAGAGGCCACACACACACACACCATGGAATACTATTCAGCCAAAAAAAGGAATTCAA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC NM:i:0 MD:Z:50 AS:i:50 XS:i:46 RG:Z:Fsimout_mB SA:Z:rgi|59585|emb|X04615.1|,2000,+,50S40M,9,0; YC:Z:GA YD:Z:f] +[sf95_Ref_48245009_48245108_48245208_Vir_-_2000_2044_R_100_90 113 gi|59585|emb|X04615.1| 2000 60 40S46M4S chr18 48245109 0 TTTTTTGGCTGAATAGTATTCCATGGTGTGTGTGTGTGTGGCCTCTGCTCTGTATCGGGAGGCCTTAGAGTCTCCGGAACATTGTTGTGT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC NM:i:0 MD:Z:46 AS:i:46 XS:i:27 RG:Z:Fsimout_mB SA:Z:fchr2,13996555,+,50S40M,0,0; YC:Z:CT YD:Z:r] */ /*if (sam_format1(h, d, &ks2) < 0) { fprintf(stderr, "Error writing output.\n"); exit_code = 1; break; }*/ if (((flag & 3) == 3) /*&& enoughMapQ >= myConfig.samples*/) { /*printf(">%d[%s]\n",checkMapQ(ChrIsHum, b, true),ks_str(&ks1)); printf("-%d[%s]\n",checkMapQ(ChrIsHum, d, false),ks_str(&ks2)); if (flag & 4) { printf("+%d[%s]\n",checkMapQ(ChrIsHum, d2, false),ks_str(&ks3)); } printf("<--%d\n",enoughMapQ);*/ if (sam_plp_push(ChrIsHum, pierCluster, b) == 0) { //printf("--HumRange=%s:%d-%d\n", h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos); if ((!ChrIsHum[(d->core).tid]) && (flag & 2)) sam_plp_push(ChrIsHum, pierCluster, d); //if ((!ChrIsHum[(d2->core).tid]) && (flag & 4)) sam_plp_push(ChrIsHum, pierCluster, d2); } else { //print fprintf(fs,"[%s]\nHumRange=%s:%d-%d\n", BamID, h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos); fprintf(fs,"VirRange=%s:%d-%d\n", h->target_name[(pierCluster->VirusRange).tid], (pierCluster->VirusRange).pos, (pierCluster->VirusRange).endpos); for (size_t i=0; i<kv_size(pierCluster->Reads);++i) { bam1_t *bi = kv_A(pierCluster->Reads, i); if (sam_format1(h, bi, &ks1) < 0) { fprintf(stderr, "Error writing output.\n"); exit_code 
= 1; break; } else { fprintf(fs,"%s\n",ks1.s); } } fprintf(fs,"\n"); //printf("HumRange=%s:%d-%d\n", h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos); //fflush(fs); sam_plp_dectroy(pierCluster); pierCluster = sam_plp_init(); } } } /*char *qname = bam_get_qname(b); if (sam_write1(out, h, b) < 0) { fprintf(stderr, "[x]Error writing output.\n"); exit_code = 1; break; }*/ } /* r = sam_close(out); // stdout can only be closed once if (r < 0) { fprintf(stderr, "Error closing output.\n"); exit_code = 1; } */ hts_idx_destroy(idx); bam_destroy1(b); bam_destroy1(d); bam_destroy1(d2); bam_hdr_destroy(h); r = sam_close(in); free(ChrIsHum); #ifdef DEBUGa fflush(NULL); //pressAnyKey(); #endif sam_plp_dectroy(pierCluster); //printf("<[%d]:\n",bami); } } fclose(fs); getPairedSam(NULL, NULL, NULL, NULL); // sam_close(fp2); //printf("---[%d]---\n",exit_code); bam_destroy1(bR1); bam_destroy1(bR2); bam_destroy1(bR3); ks_release(&ks1); ks_release(&ks2); ks_release(&ks3); return exit_code; }