/*
 * Read up to n_needed reads from the BAM stream held in bs and convert each
 * selected record into a bwa_seq_t.
 *
 * Record selection is driven by bs->which: mask 1 keeps records flagged
 * BAM_FREAD1, mask 2 keeps records flagged BAM_FREAD2, and mask 4 keeps
 * records carrying neither flag.
 *
 * For every kept record the bases are translated through bam_nt16_nt4_table,
 * qualities are stored as offset-33 characters capped at 126, and records on
 * the reverse strand are passed through seq_reverse() first.  When
 * trim_qual >= 1 the read is handed to bwa_trim_read().
 *
 * bs         input handle (bs->fp is the BAM stream)
 * n_needed   maximum number of reads to return
 * n          receives the number of reads stored
 * is_comp    forwarded to seq_reverse() when building rseq
 * trim_qual  quality-trimming threshold; values < 1 disable trimming
 *
 * Returns a calloc'ed array of *n reads, or 0 when no read was stored.
 */
static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int trim_qual)
{
	bwa_seq_t *seqs, *p;
	int n_seqs = 0, l, i;
	long n_trimmed = 0, n_tot = 0;
	bam1_t *b;
	int res;

	/* The loop stores a record before it tests the batch limit, so a
	 * non-positive request would write past a zero-sized array. */
	if (n_needed <= 0) {
		*n = 0;
		return 0;
	}
	b = bam_init1();
	seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
	if (b == 0 || seqs == 0)
		err_fatal_simple("Out of memory while reading bam file");

	while ((res = bam_read1(bs->fp, b)) >= 0) {
		uint8_t *s, *q;
		int go = 0;
		if ((bs->which & 1) && (b->core.flag & BAM_FREAD1)) go = 1;
		if ((bs->which & 2) && (b->core.flag & BAM_FREAD2)) go = 1;
		if ((bs->which & 4) && !(b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) go = 1;
		if (go == 0) continue;

		l = b->core.l_qseq;
		p = &seqs[n_seqs++];
		p->tid = -1; // not assigned to a thread
		p->qual = 0;
		p->full_len = p->clip_len = p->len = l;
		n_tot += p->full_len;
		s = bam1_seq(b);
		q = bam1_qual(b);
		p->seq = (ubyte_t*)calloc(p->len + 1, 1);
		p->qual = (ubyte_t*)calloc(p->len + 1, 1);
		if (p->seq == 0 || p->qual == 0)
			err_fatal_simple("Out of memory while reading bam file");
		for (i = 0; i != p->full_len; ++i) {
			p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(s, i)];
			p->qual[i] = q[i] + 33 < 126? q[i] + 33 : 126;
		}
		if (bam1_strand(b)) { // then reverse
			seq_reverse(p->len, p->seq, 1);
			seq_reverse(p->len, p->qual, 0);
		}
		if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);

		p->rseq = (ubyte_t*)calloc(p->full_len, 1);
		/* calloc(0, 1) may legitimately return NULL for an empty read */
		if (p->rseq == 0 && p->full_len > 0)
			err_fatal_simple("Out of memory while reading bam file");
		memcpy(p->rseq, p->seq, p->len);
		seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
		seq_reverse(p->len, p->rseq, is_comp);

		p->name = strdup((const char*)bam1_qname(b));
		if (p->name == 0)
			err_fatal_simple("Out of memory while reading bam file");
		if (n_seqs == n_needed) break;
	}
	/* -1 is treated as the normal end of input; other negatives are errors */
	if (res < 0 && res != -1) err_fatal_simple("Error reading bam file");

	*n = n_seqs;
	if (n_seqs && trim_qual >= 1)
		fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);
	if (n_seqs == 0) {
		free(seqs);
		bam_destroy1(b);
		return 0;
	}
	bam_destroy1(b);
	return seqs;
}
/**
 * Fill sequence->seq and sequence->rseq from a read given as characters.
 *
 * Each base is translated through nst_nt4_table.  Both buffers start out as
 * the same encoded read; seq is then passed to seq_reverse() with flag 0 and
 * rseq with flag 1.  full_len and len are both set to read_length.
 *
 * @param sequence     record whose seq/rseq/len/full_len fields are written
 * @param bases        read bases, at least read_length characters
 * @param read_length  number of bases to copy
 */
void BWA::copy_bases_into_sequence(bwa_seq_t* sequence, const char* bases, const unsigned read_length) {
    // seq, rseq will ultimately be freed by bwa_cal_sa_reg_gap
    // NOTE(review): the buffers come from new[]; if the code that releases
    // them uses free(), the allocator and deallocator do not match -- confirm.
    sequence->seq = new ubyte_t[read_length];
    sequence->rseq = new ubyte_t[read_length];

    // Index through unsigned char: where plain char is signed, a byte >= 0x80
    // would otherwise become a huge unsigned index and read out of bounds.
    for(unsigned i = 0; i < read_length; i++)
        sequence->seq[i] = nst_nt4_table[(unsigned char)bases[i]];
    memcpy(sequence->rseq,sequence->seq,read_length);

    // BWA expects the read bases to arrive reversed.
    seq_reverse(read_length,sequence->seq,0);
    seq_reverse(read_length,sequence->rseq,1);

    sequence->full_len = sequence->len = read_length;
}
/*
 * Build a run-length encoded index (rld_t) from n reads.
 *
 * Each usable read is recoded through seq_nt6_table and contributes two
 * NUL-terminated strings to one buffer: the read after seq_reverse() and the
 * read after a further seq_revcomp6().  Reads containing code 5 are dropped.
 * The buffer is inserted into a rope with mr_insert_multi(), and the rope
 * blocks are then re-encoded into the returned rld_t.
 *
 * On the normal path the function consumes its input: every quality string,
 * every sequence string and the seq array itself are freed.
 *
 * n      number of entries in seq
 * seq    reads; released by this function (see the note on the early return)
 * is_mt  forwarded to mr_insert_multi()
 *
 * Returns the index, or 0 when no read has a positive length.
 */
struct rld_t *fml_fmi_gen(int n, bseq1_t *seq, int is_mt)
{
	mrope_t *mr;
	kstring_t str = {0,0,0};
	mritr_t itr;
	rlditr_t di;
	const uint8_t *block;
	rld_t *e = 0;
	int k;

	/* NOTE(review): this early return leaves seq and its strings untouched,
	 * unlike the normal path which frees them -- confirm who owns seq when
	 * 0 is returned. */
	for (k = 0; k < n; ++k)
		if (seq[k].l_seq > 0) break;
	if (k == n) return 0;

	mr = mr_init(ROPE_DEF_MAX_NODES, ROPE_DEF_BLOCK_LEN, MR_SO_RCLO);
	for (k = 0; k < n; ++k) {
		int i;
		bseq1_t *s = &seq[k];
		if (s->l_seq == 0) {
			/* the array is freed below, so release the strings of skipped
			 * reads here or they become unreachable */
			free(s->seq);
			free(s->qual);
			continue;
		}
		free(s->qual);
		/* index through unsigned char so the subscript is never negative */
		for (i = 0; i < s->l_seq; ++i)
			s->seq[i] = seq_nt6_table[(unsigned char)s->seq[i]];
		for (i = 0; i < s->l_seq; ++i)
			if (s->seq[i] == 5) break;
		if (i < s->l_seq) { // the read contains code 5: drop it
			free(s->seq);
			continue;
		}
		if (is_rev_same(s->l_seq, s->seq))
			--s->l_seq, s->seq[s->l_seq] = 0;
		seq_reverse(s->l_seq, (uint8_t*)s->seq);
		kputsn(s->seq, s->l_seq + 1, &str); // +1 keeps the terminating 0
		seq_revcomp6(s->l_seq, (uint8_t*)s->seq);
		kputsn(s->seq, s->l_seq + 1, &str);
		free(s->seq);
	}
	free(seq);
	mr_insert_multi(mr, str.l, (uint8_t*)str.s, is_mt);
	free(str.s);

	e = rld_init(6, 3);
	rld_itr_init(e, &di, 0);
	mr_itr_first(mr, &itr, 1);
	while ((block = mr_itr_next_block(&itr)) != 0) {
		const uint8_t *q = block + 2, *end = block + 2 + *rle_nptr(block);
		while (q < end) {
			int c = 0;
			int64_t l;
			rle_dec1(q, c, l);
			rld_enc(e, &di, l, c);
		}
	}
	rld_enc_finish(e, &di);
	mr_destroy(mr);
	return e;
}
// Mostly stolen from bwa_read_bam. void bam1_to_seq(bam1_t *raw, bwa_seq_t *p, int is_comp, int trim_qual) { // long n_trimmed = 0; uint8_t *s, *q; int i, l = raw->core.l_qseq; p->tid = -1; // no assigned to a thread p->qual = 0; p->full_len = p->clip_len = p->len = l; // n_tot += p->full_len; s = bam1_seq(raw); q = bam1_qual(raw); p->seq = (ubyte_t*)calloc(p->len + 1, 1); p->qual = (ubyte_t*)calloc(p->len + 1, 1); for (i = 0; i != p->full_len; ++i) { p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(s, i)]; p->qual[i] = q[i] + 33 < 126? q[i] + 33 : 126; } if (bam1_strand(raw)) { // then reverse seq_reverse(p->len, p->seq, 1); seq_reverse(p->len, p->qual, 0); } if (trim_qual >= 1) /* n_trimmed += */ bwa_trim_read(trim_qual, p); p->rseq = (ubyte_t*)calloc(p->full_len, 1); memcpy(p->rseq, p->seq, p->len); seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped() seq_reverse(p->len, p->rseq, is_comp); p->max_entries = 0 ; // We don't set a name, it's contained in the original record // anyway. // p->name = strdup((const char*)bam1_qname(raw)); // No place to put the tally right now. // if (n_seqs && trim_qual >= 1) // fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot); }
/*
 * Read up to n_needed reads from bs and convert them into bwa_seq_t records.
 *
 * BAM input (bs->is_bam) is delegated to bwa_read_bam().  Otherwise records
 * come from kseq_read() and are processed as follows:
 *   - with BWA_MODE_CFY, a record whose comment has 'Y' right after the first
 *     ':' is skipped;
 *   - with BWA_MODE_IL13, 31 is subtracted from every quality character;
 *   - the top bits of mode (mode >> 24) give a barcode length l_bc; reads no
 *     longer than l_bc are skipped, otherwise the first l_bc bases are moved
 *     into p->bc (lower case when their quality is below BARCODE_LOW_QUAL)
 *     and removed from the sequence and quality strings;
 *   - bases are translated through nst_nt4_table, and a trailing "/1" or "/2"
 *     is removed from the name.
 *
 * bs         input handle
 * n_needed   maximum number of reads to return
 * n          receives the number of reads stored
 * mode       BWA_MODE_* flags plus the barcode length in the top bits
 * trim_qual  quality-trimming threshold; values < 1 disable trimming
 *
 * Returns a calloc'ed array of *n reads, or 0 when nothing was read or the
 * barcode length exceeds BWA_MAX_BCLEN.
 */
bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int trim_qual)
{
	bwa_seq_t *seqs, *p;
	kseq_t *seq = bs->ks;
	int n_seqs, l, i, is_comp = mode&BWA_MODE_COMPREAD, is_64 = mode&BWA_MODE_IL13, l_bc = mode>>24;
	long n_trimmed = 0, n_tot = 0;

	if (l_bc > BWA_MAX_BCLEN) {
		fprintf(stderr, "[%s] the maximum barcode length is %d.\n", __func__, BWA_MAX_BCLEN);
		*n = 0; // do not leave the caller's counter unset on failure
		return 0;
	}
	if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); // l_bc has no effect for BAM input

	/* The loop stores a record before it tests the batch limit, so a
	 * non-positive request would write past a zero-sized array. */
	if (n_needed <= 0) {
		*n = 0;
		return 0;
	}
	n_seqs = 0;
	seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
	while ((l = kseq_read(seq)) >= 0) {
		if ((mode & BWA_MODE_CFY) && (seq->comment.l != 0)) {
			// skip reads that are marked to be filtered by Casava
			char *s = strchr(seq->comment.s, ':');
			if (s && *(++s) == 'Y') {
				continue;
			}
		}
		if (is_64 && seq->qual.l)
			for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31;
		if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length
		p = &seqs[n_seqs++];
		if (l_bc) { // then trim barcode
			/* <ctype.h> functions need an unsigned char value */
			for (i = 0; i < l_bc; ++i)
				p->bc[i] = (seq->qual.l && seq->qual.s[i]-33 < BARCODE_LOW_QUAL)?
					tolower((unsigned char)seq->seq.s[i]) : toupper((unsigned char)seq->seq.s[i]);
			p->bc[i] = 0;
			for (; i < seq->seq.l; ++i)
				seq->seq.s[i - l_bc] = seq->seq.s[i];
			seq->seq.l -= l_bc; seq->seq.s[seq->seq.l] = 0;
			if (seq->qual.l) {
				for (i = l_bc; i < seq->qual.l; ++i)
					seq->qual.s[i - l_bc] = seq->qual.s[i];
				seq->qual.l -= l_bc; seq->qual.s[seq->qual.l] = 0;
			}
			l = seq->seq.l;
		} else p->bc[0] = 0;
		p->tid = -1; // not assigned to a thread
		p->qual = 0;
		p->full_len = p->clip_len = p->len = l;
		n_tot += p->full_len;
		p->seq = (ubyte_t*)calloc(p->len, 1);
		/* index through unsigned char: a byte >= 0x80 must not become a
		 * negative subscript where plain char is signed */
		for (i = 0; i != p->full_len; ++i)
			p->seq[i] = nst_nt4_table[(unsigned char)seq->seq.s[i]];
		if (seq->qual.l) { // copy quality
			p->qual = (ubyte_t*)strdup((char*)seq->qual.s);
			if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
		}
		p->rseq = (ubyte_t*)calloc(p->full_len, 1);
		memcpy(p->rseq, p->seq, p->len);
		seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
		seq_reverse(p->len, p->rseq, is_comp);
		p->name = strdup((const char*)seq->name.s);
		{ // trim /[12]$
			int t = strlen(p->name);
			if (t > 2 && p->name[t-2] == '/' && (p->name[t-1] == '1' || p->name[t-1] == '2')) p->name[t-2] = '\0';
		}
		if (n_seqs == n_needed) break;
	}
	*n = n_seqs;
	if (n_seqs && trim_qual >= 1)
		fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed. \n", 100.0f * n_trimmed/n_tot);
	if (n_seqs == 0) {
		free(seqs);
		return 0;
	}
	return seqs;
}
void BWA::generate_alignments_from_paths(const char* bases, const unsigned read_length, bwt_aln1_t* paths, const unsigned num_paths, const unsigned best_count, const unsigned second_best_count, Alignment*& alignments, unsigned& num_alignments) { bwa_seq_t* sequence = create_sequence(bases,read_length); sequence->aln = paths; sequence->n_aln = num_paths; // (Ab)use bwa_aln2seq to propagate values stored in the path out into the sequence itself. bwa_aln2seq(sequence->n_aln,sequence->aln,sequence); // But overwrite key parts of the sequence in case the user passed back only a smaller subset // of the paths. sequence->c1 = best_count; sequence->c2 = second_best_count; sequence->type = sequence->c1 > 1 ? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE; num_alignments = 0; for(unsigned i = 0; i < (unsigned)sequence->n_aln; i++) num_alignments += (sequence->aln + i)->l - (sequence->aln + i)->k + 1; alignments = new Alignment[num_alignments]; unsigned alignment_idx = 0; for(unsigned path_idx = 0; path_idx < (unsigned)num_paths; path_idx++) { // Stub in a 'working' path, so that only the desired alignment is local-aligned. const bwt_aln1_t* path = paths + path_idx; bwt_aln1_t working_path = *path; // Loop through all alignments, aligning each one individually. for(unsigned sa_idx = path->k; sa_idx <= path->l; sa_idx++) { working_path.k = working_path.l = sa_idx; sequence->aln = &working_path; sequence->n_aln = 1; sequence->sa = sa_idx; sequence->strand = path->a; sequence->score = path->score; // Each time through bwa_refine_gapped, seq gets reversed. Revert the reverse. // TODO: Fix the interface to bwa_refine_gapped so its easier to work with. if(alignment_idx > 0) seq_reverse(sequence->len, sequence->seq, 0); // Copy the local alignment data into the alignment object. *(alignments + alignment_idx) = generate_final_alignment_from_sequence(sequence); alignment_idx++; } } sequence->aln = NULL; sequence->n_aln = 0; bwa_free_read_seq(1,sequence); }
/*
 * Refine gapped alignments and generate MD tags for reads
 * seqs[n_seqs1 .. n_seqs2-1].
 *
 * Pass 1 reverses each read's seq in place and recomputes the CIGAR of every
 * gapped hit with refine_gapped_core().  Pass 2 runs only when ntbns is set
 * (colour space).  Pass 3 fills md/nm through bwa_cal_md1().  Trimmed reads
 * are corrected last, and only when ntbns is not set.
 *
 * NOTE(review): ntpac is initialised to 0 and never assigned in this
 * function, yet the colour-space pass hands it to bwa_cs2nt_core() and
 * refine_gapped_core() -- confirm colour space is unsupported here or that
 * the buffer should be loaded first.
 */
void bwa_rg_tpx(int iidx, const bntseq_t *bns, int n_seqs1, int n_seqs2, bwa_seq_t *seqs, ubyte_t *pacseq, bntseq_t *ntbns)
{
	ubyte_t *ntpac = 0;
	int idx, hit_idx;
	kstring_t *str;
#ifdef _TIMING
	struct timeval st;
	uint64_t s1, e1;
	double pos1_time = 0.0;

	gettimeofday(&st, NULL);
	s1 = st.tv_sec * 1000000L + (time_t)st.tv_usec;
#endif

	/* pass 1: refine against the packed reference */
	for (idx = n_seqs1; idx < n_seqs2; ++idx) {
		bwa_seq_t *read = &seqs[idx];

		seq_reverse(read->len, read->seq, 0); // IMPORTANT: s->seq is reversed here!!!
		for (hit_idx = 0; hit_idx < read->n_multi; ++hit_idx) {
			bwt_multi1_t *hit = &read->multi[hit_idx];
			int n_cigar;

			if (hit->gap != 0) {
				hit->cigar = refine_gapped_core(bns->l_pac, pacseq, read->len,
												hit->strand? read->rseq : read->seq, &hit->pos,
												(hit->strand? 1 : -1) * hit->gap, &n_cigar, 1);
				hit->n_cigar = n_cigar;
			}
		}
		if (read->type != BWA_TYPE_NO_MATCH && read->type != BWA_TYPE_MATESW && read->n_gapo != 0) {
			read->cigar = refine_gapped_core(bns->l_pac, pacseq, read->len,
											 read->strand? read->rseq : read->seq, &read->pos,
											 (read->strand? 1 : -1) * (read->n_gapo + read->n_gape),
											 &read->n_cigar, 1);
		}
	}

	/* pass 2: colour space only */
	if (ntbns) {
		for (idx = n_seqs1; idx < n_seqs2; ++idx) {
			bwa_seq_t *read = &seqs[idx];

			bwa_cs2nt_core(read, bns->l_pac, ntpac);
			for (hit_idx = 0; hit_idx < read->n_multi; ++hit_idx) {
				bwt_multi1_t *hit = &read->multi[hit_idx];
				int n_cigar;

				if (hit->gap != 0) {
					free(hit->cigar);
					hit->cigar = refine_gapped_core(bns->l_pac, ntpac, read->len,
													hit->strand? read->rseq : read->seq, &hit->pos,
													(hit->strand? 1 : -1) * hit->gap, &n_cigar, 0);
					hit->n_cigar = n_cigar;
				}
			}
			if (read->type != BWA_TYPE_NO_MATCH && read->cigar) { // update cigar again
				free(read->cigar);
				read->cigar = refine_gapped_core(bns->l_pac, ntpac, read->len,
												 read->strand? read->rseq : read->seq, &read->pos,
												 (read->strand? 1 : -1) * (read->n_gapo + read->n_gape),
												 &read->n_cigar, 0);
			}
		}
	}

	/* pass 3: generate MD tag; str is a scratch buffer shared by all reads */
	str = (kstring_t*)calloc(1, sizeof(kstring_t));
	for (idx = n_seqs1; idx < n_seqs2; ++idx) {
		bwa_seq_t *read = &seqs[idx];

		if (read->type == BWA_TYPE_NO_MATCH)
			continue;
		{
			int nm;
			read->md = bwa_cal_md1(read->n_cigar, read->cigar, read->len, read->pos,
								   read->strand? read->rseq : read->seq,
								   bns->l_pac, ntbns? ntpac : pacseq, str, &nm);
			read->nm = nm;
		}
	}
	free(str->s);
	free(str);

	/* correct for trimmed reads; trimming is only enabled for Illumina reads */
	if (!ntbns) {
		for (idx = n_seqs1; idx < n_seqs2; ++idx)
			bwa_correct_trimmed(&seqs[idx]);
	}

#ifdef _TIMING
	gettimeofday(&st, NULL);
	e1 = st.tv_sec * 1000000L + (time_t)st.tv_usec;
	pos1_time = (double)((double)e1 - (double)s1) / 1000000.0;

# ifdef HAVE_PTHREAD
	pthread_mutex_lock(&pe_lock);
# endif // HAVE_PTHREAD

	fprintf(stderr,"bwapese1 time = %lf (sec)\n",pos1_time);

# ifdef HAVE_PTHREAD
	pthread_mutex_unlock(&pe_lock);
# endif // HAVE_PTHREAD
#endif
}
s_align* ssw_align (const s_profile* prof, const int8_t* ref, int32_t refLen, const uint8_t weight_gapO, const uint8_t weight_gapE, const uint8_t flag, // (from high to low) bit 5: return the best alignment beginning position; 6: if (ref_end1 - ref_begin1 <= filterd) && (read_end1 - read_begin1 <= filterd), return cigar; 7: if max score >= filters, return cigar; 8: always return cigar; if 6 & 7 are both setted, only return cigar when both filter fulfilled const uint16_t filters, const int32_t filterd, const int32_t maskLen) { alignment_end* bests = 0, *bests_reverse = 0; __m128i* vP = 0; int32_t word = 0, band_width = 0, readLen = prof->readLen; int8_t* read_reverse = 0; cigar* path; s_align* r = (s_align*)calloc(1, sizeof(s_align)); r->ref_begin1 = -1; r->read_begin1 = -1; r->cigar = 0; r->cigarLen = 0; if (maskLen < 15) { fprintf(stderr, "When maskLen < 15, the function ssw_align doesn't return 2nd best alignment information.\n"); } // Find the alignment scores and ending positions if (prof->profile_byte) { bests = sw_sse2_byte(ref, 0, refLen, readLen, weight_gapO, weight_gapE, prof->profile_byte, -1, prof->bias, maskLen); if (prof->profile_word && bests[0].score == 255) { free(bests); bests = sw_sse2_word(ref, 0, refLen, readLen, weight_gapO, weight_gapE, prof->profile_word, -1, maskLen); word = 1; } else if (bests[0].score == 255) { fprintf(stderr, "Please set 2 to the score_size parameter of the function ssw_init, otherwise the alignment results will be incorrect.\n"); free(r); return NULL; } }else if (prof->profile_word) { bests = sw_sse2_word(ref, 0, refLen, readLen, weight_gapO, weight_gapE, prof->profile_word, -1, maskLen); word = 1; }else { fprintf(stderr, "Please call the function ssw_init before ssw_align.\n"); free(r); return NULL; } r->score1 = bests[0].score; r->ref_end1 = bests[0].ref; r->read_end1 = bests[0].read; if (maskLen >= 15) { r->score2 = bests[1].score; r->ref_end2 = bests[1].ref; } else { r->score2 = 0; r->ref_end2 = -1; } free(bests); if 
(flag == 0 || (flag == 2 && r->score1 < filters)) goto end; // Find the beginning position of the best alignment. read_reverse = seq_reverse(prof->read, r->read_end1); if (word == 0) { vP = qP_byte(read_reverse, prof->mat, r->read_end1 + 1, prof->n, prof->bias); bests_reverse = sw_sse2_byte(ref, 1, r->ref_end1 + 1, r->read_end1 + 1, weight_gapO, weight_gapE, vP, r->score1, prof->bias, maskLen); } else { vP = qP_word(read_reverse, prof->mat, r->read_end1 + 1, prof->n); bests_reverse = sw_sse2_word(ref, 1, r->ref_end1 + 1, r->read_end1 + 1, weight_gapO, weight_gapE, vP, r->score1, maskLen); } free(vP); free(read_reverse); r->ref_begin1 = bests_reverse[0].ref; r->read_begin1 = r->read_end1 - bests_reverse[0].read; free(bests_reverse); if ((7&flag) == 0 || ((2&flag) != 0 && r->score1 < filters) || ((4&flag) != 0 && (r->ref_end1 - r->ref_begin1 > filterd || r->read_end1 - r->read_begin1 > filterd))) goto end; // Generate cigar. refLen = r->ref_end1 - r->ref_begin1 + 1; readLen = r->read_end1 - r->read_begin1 + 1; band_width = abs(refLen - readLen) + 1; path = banded_sw(ref + r->ref_begin1, prof->read + r->read_begin1, refLen, readLen, r->score1, weight_gapO, weight_gapE, band_width, prof->mat, prof->n); if (path == 0) { free(r); r = NULL; } else { r->cigar = path->seq; r->cigarLen = path->length; free(path); } end: return r; }
/*
 * Read one batch of reads with kseq_read1() into a reusable array.
 *
 * When *n_avail is 0 a fresh array of READ_SEQ_SIZE records is allocated and
 * published through *_seqs / *n_avail; otherwise the caller's array is
 * reused.  Per-record buffers are reused too: seq and rseq are reallocated
 * only when the new read is longer than the slot's recorded capacity
 * (p->llen).
 *
 * Filtering and barcode handling follow the mode flags:
 *   - with BWA_MODE_CFY, a record whose comment has 'Y' right after the first
 *     ':' is skipped;
 *   - with BWA_MODE_IL13, 31 is subtracted from every quality character;
 *   - mode >> 24 is the barcode length l_bc; reads no longer than l_bc are
 *     skipped, otherwise the barcode is moved into p->bc and stripped.
 *
 * BAM input is not supported: the function prints a message, frees the
 * array and calls exit().
 *
 * Returns the number of reads stored (0 when the barcode length exceeds
 * BWA_MAX_BCLEN); aborts if more than READ_SEQ_SIZE reads arrive.
 */
int bwa_read_seq1(bwa_seqio_t *bs, int iter, int tid, int thrds, bwa_seq_t **_seqs, int *n_avail, int mode, int trim_qual)
{
	bwa_seq_t *p;
	bwa_seq_t *seqs = *_seqs;
	kseq_t *seq = bs->ks;
	int n_seqs, l, i, is_comp = mode&BWA_MODE_COMPREAD, is_64 = mode&BWA_MODE_IL13, l_bc = mode>>24;
	long n_trimmed = 0, n_tot = 0;
	bool first;

	if (l_bc > BWA_MAX_BCLEN) {
		fprintf(stderr, "[%s] the maximum barcode length is %d.\n", __func__, BWA_MAX_BCLEN);
		return 0;
	}
	if (bs->is_bam) {
		fprintf (stderr, "IS BAM! --- Port bwa_read_bam function\n");
		bwa_free_read_seq(*n_avail, seqs);
		exit(0);
	}
	if (*n_avail == 0) {
		/* nothing to release here: the array has no slots yet */
		seqs = (bwa_seq_t*)calloc(READ_SEQ_SIZE, sizeof(bwa_seq_t));
		*_seqs = seqs;
		*n_avail = READ_SEQ_SIZE;
	}

	n_seqs = 0;
	first = true;
	while ((l = kseq_read1(seq, iter, tid, thrds, &first)) >= 0) {
		if ((mode & BWA_MODE_CFY) && (seq->comment.l != 0)) {
			// skip reads that are marked to be filtered by Casava
			char *s = strchr(seq->comment.s, ':');
			if (s && *(++s) == 'Y') {
				continue;
			}
		}
		if (is_64 && seq->qual.l)
			for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31;
		if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length
		p = &seqs[n_seqs++];
		if (n_seqs > READ_SEQ_SIZE) {
			fprintf (stderr, "READ_SEQ_SIZE not big enough\n");
			abort();
		}
		init_bwa_seq_t(p);
		if (l_bc) { // then trim barcode
			/* <ctype.h> functions need an unsigned char value */
			for (i = 0; i < l_bc; ++i)
				p->bc[i] = (seq->qual.l && seq->qual.s[i]-33 < BARCODE_LOW_QUAL)?
					tolower((unsigned char)seq->seq.s[i]) : toupper((unsigned char)seq->seq.s[i]);
			p->bc[i] = 0;
			for (; i < seq->seq.l; ++i)
				seq->seq.s[i - l_bc] = seq->seq.s[i];
			seq->seq.l -= l_bc; seq->seq.s[seq->seq.l] = 0;
			if (seq->qual.l) {
				for (i = l_bc; i < seq->qual.l; ++i)
					seq->qual.s[i - l_bc] = seq->qual.s[i];
				seq->qual.l -= l_bc; seq->qual.s[seq->qual.l] = 0;
			}
			l = seq->seq.l;
		} else p->bc[0] = 0;

		p->tid = -1; // not assigned to a thread
		p->full_len = p->clip_len = p->len = l;
		n_tot += p->full_len;
		if (p->llen < p->len) {
			/* grow both buffers; rseq used to be overwritten without being
			 * released, leaking the previous allocation */
			free(p->seq);
			free(p->rseq);
			p->llen = p->len;
			p->seq = (ubyte_t*)calloc(p->len, 1);
			p->rseq = (ubyte_t*)calloc(p->full_len, 1);
		}
		/* index through unsigned char: a byte >= 0x80 must not become a
		 * negative subscript where plain char is signed */
		for (i = 0; i != p->full_len; ++i)
			p->seq[i] = nst_nt4_table[(unsigned char)seq->seq.s[i]];
		/* NOTE(review): when this record has no quality string, p->qual keeps
		 * whatever the slot held before unless init_bwa_seq_t() resets it --
		 * confirm. */
		if (seq->qual.l) { // copy quality
			free(p->qual);
			p->qual = (ubyte_t*)strdup((char*)seq->qual.s);
			if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
		}
		memcpy(p->rseq, p->seq, p->len);
		seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
		seq_reverse(p->len, p->rseq, is_comp);
		free(p->name);
		p->name = strdup((const char*)seq->name.s);
		{ // trim /[12]$
			int t = strlen(p->name);
			if (t > 2 && p->name[t-2] == '/' && (p->name[t-1] == '1' || p->name[t-1] == '2')) p->name[t-2] = '\0';
		}
		if (kseq_end(seq)) break;
	}
	if (n_seqs && trim_qual >= 1)
		fprintf(stderr, "[bwa_read_seq1] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);
	return n_seqs;
}
/*
 * Read up to n_needed reads from the BAM stream held in bs and convert each
 * selected record into a bwa_seq_t.  Builds against either the htslib API
 * (USE_HTSLIB) or the older bam API.
 *
 * Record selection is driven by bs->which: mask 1 keeps records flagged
 * BAM_FREAD1, mask 2 keeps records flagged BAM_FREAD2, and mask 4 keeps
 * records carrying neither flag.  Qualities are stored as offset-33
 * characters capped at 126.
 *
 * Returns a calloc'ed array of *n reads, or 0 when no read was stored.
 */
static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int trim_qual)
{
	bwa_seq_t *seqs, *p;
	int n_seqs = 0, l, i;
	long n_trimmed = 0, n_tot = 0;
	bam1_t *b;
	int res;

	/* The loop stores a record before it tests the batch limit, so a
	 * non-positive request would write past a zero-sized array. */
	if (n_needed <= 0) {
		*n = 0;
		return 0;
	}
	b = bam_init1();
	seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
	if (b == 0 || seqs == 0)
		err_fatal_simple("Out of memory while reading bam file");

	for (;;) {
		uint8_t *s, *q;
		int go = 0;
		int is_rev;
#ifdef USE_HTSLIB
		res = sam_read1(bs->fp, bs->h, b);
#else
		res = bam_read1(bs->fp, b);
#endif
		if (res < 0) break;

		if ((bs->which & 1) && (b->core.flag & BAM_FREAD1)) go = 1;
		if ((bs->which & 2) && (b->core.flag & BAM_FREAD2)) go = 1;
		if ((bs->which & 4) && !(b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) go = 1;
		if (go == 0) continue;

		l = b->core.l_qseq;
		p = &seqs[n_seqs++];
		p->tid = -1; // not assigned to a thread
		p->qual = 0;
		p->full_len = p->clip_len = p->len = l;
		n_tot += p->full_len;
#ifdef USE_HTSLIB
		s = bam_get_seq(b); q = bam_get_qual(b);
#else
		s = bam1_seq(b); q = bam1_qual(b);
#endif
		p->seq = (ubyte_t*)calloc(p->len + 1, 1);
		p->qual = (ubyte_t*)calloc(p->len + 1, 1);
		if (p->seq == 0 || p->qual == 0)
			err_fatal_simple("Out of memory while reading bam file");
		for (i = 0; i != p->full_len; ++i) {
#ifdef USE_HTSLIB
			p->seq[i] = bam_nt16_nt4_table[(int)bam_seqi(s, i)];
#else
			p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(s, i)];
#endif
			p->qual[i] = q[i] + 33 < 126? q[i] + 33 : 126;
		}
#ifdef USE_HTSLIB
		is_rev = bam_is_rev(b) ? 1 : 0;
#else
		is_rev = bam1_strand(b) ? 1 : 0;
#endif
		if (is_rev) { // then reverse
			seq_reverse(p->len, p->seq, 1);
			seq_reverse(p->len, p->qual, 0);
		}
		if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);

		p->rseq = (ubyte_t*)calloc(p->full_len, 1);
		/* calloc(0, 1) may legitimately return NULL for an empty read */
		if (p->rseq == 0 && p->full_len > 0)
			err_fatal_simple("Out of memory while reading bam file");
		memcpy(p->rseq, p->seq, p->len);
		seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
		seq_reverse(p->len, p->rseq, is_comp);
#ifdef USE_HTSLIB
		p->name = strdup((const char*)bam_get_qname(b));
#else
		p->name = strdup((const char*)bam1_qname(b));
#endif
		if (p->name == 0)
			err_fatal_simple("Out of memory while reading bam file");
		if (n_seqs == n_needed) break;
	}
	/* -1 is treated as the normal end of input; other negatives are errors */
	if (res < 0 && res != -1) err_fatal_simple("Error reading bam file");

	*n = n_seqs;
	if (n_seqs && trim_qual >= 1)
		fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);
	if (n_seqs == 0) {
		free(seqs);
		bam_destroy1(b);
		return 0;
	}
	bam_destroy1(b);
	return seqs;
}

/* Barcode bases whose quality (after removing the 33 offset) is below this
 * value are stored in lower case. */
#define BARCODE_LOW_QUAL 13

/*
 * Read up to n_needed reads from bs and convert them into bwa_seq_t records.
 *
 * BAM input (bs->is_bam) is delegated to bwa_read_bam().  Otherwise records
 * come from kseq_read():
 *   - with BWA_MODE_CFY, a record whose comment has 'Y' right after the first
 *     ':' is skipped;
 *   - with BWA_MODE_IL13, 31 is subtracted from every quality character;
 *   - mode >> 24 is the barcode length l_bc; reads no longer than l_bc are
 *     skipped, otherwise the barcode is moved into p->bc and stripped;
 *   - a trailing "/1" or "/2" is removed from the read name.
 *
 * Returns a calloc'ed array of *n reads (release it with
 * bwa_free_read_seq()), or 0 when nothing was read or the barcode length
 * exceeds BWA_MAX_BCLEN.
 */
bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int trim_qual)
{
	bwa_seq_t *seqs, *p;
	kseq_t *seq = bs->ks;
	int n_seqs, l, i, is_comp = mode&BWA_MODE_COMPREAD, is_64 = mode&BWA_MODE_IL13, l_bc = mode>>24;
	long n_trimmed = 0, n_tot = 0;

	if (l_bc > BWA_MAX_BCLEN) {
		fprintf(stderr, "[%s] the maximum barcode length is %d.\n", __func__, BWA_MAX_BCLEN);
		*n = 0; // do not leave the caller's counter unset on failure
		return 0;
	}
	if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); // l_bc has no effect for BAM input

	/* same reasoning as in bwa_read_bam(): never write into an empty array */
	if (n_needed <= 0) {
		*n = 0;
		return 0;
	}
	n_seqs = 0;
	seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
	while ((l = kseq_read(seq)) >= 0) {
		if ((mode & BWA_MODE_CFY) && (seq->comment.l != 0)) {
			// skip reads that are marked to be filtered by Casava
			char *s = strchr(seq->comment.s, ':');
			if (s && *(++s) == 'Y') {
				continue;
			}
		}
		if (is_64 && seq->qual.l)
			for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31;
		if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length
		p = &seqs[n_seqs++];
		if (l_bc) { // then trim barcode
			/* <ctype.h> functions need an unsigned char value */
			for (i = 0; i < l_bc; ++i)
				p->bc[i] = (seq->qual.l && seq->qual.s[i]-33 < BARCODE_LOW_QUAL)?
					tolower((unsigned char)seq->seq.s[i]) : toupper((unsigned char)seq->seq.s[i]);
			p->bc[i] = 0;
			for (; i < seq->seq.l; ++i)
				seq->seq.s[i - l_bc] = seq->seq.s[i];
			seq->seq.l -= l_bc; seq->seq.s[seq->seq.l] = 0;
			if (seq->qual.l) {
				for (i = l_bc; i < seq->qual.l; ++i)
					seq->qual.s[i - l_bc] = seq->qual.s[i];
				seq->qual.l -= l_bc; seq->qual.s[seq->qual.l] = 0;
			}
			l = seq->seq.l;
		} else p->bc[0] = 0;
		p->tid = -1; // not assigned to a thread
		p->qual = 0;
		p->full_len = p->clip_len = p->len = l;
		n_tot += p->full_len;
		p->seq = (ubyte_t*)calloc(p->full_len, 1);
		/* index through unsigned char: a byte >= 0x80 must not become a
		 * negative subscript where plain char is signed */
		for (i = 0; i != p->full_len; ++i)
			p->seq[i] = nst_nt4_table[(unsigned char)seq->seq.s[i]];
		if (seq->qual.l) { // copy quality
			p->qual = (ubyte_t*)strdup((char*)seq->qual.s);
			if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
		}
		p->rseq = (ubyte_t*)calloc(p->full_len, 1);
		memcpy(p->rseq, p->seq, p->len);
		seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
		seq_reverse(p->len, p->rseq, is_comp);
		p->name = strdup((const char*)seq->name.s);
		{ // trim /[12]$
			int t = strlen(p->name);
			if (t > 2 && p->name[t-2] == '/' && (p->name[t-1] == '1' || p->name[t-1] == '2')) p->name[t-2] = '\0';
		}
		if (n_seqs == n_needed) break;
	}
	*n = n_seqs;
	if (n_seqs && trim_qual >= 1)
		fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);
	if (n_seqs == 0) {
		free(seqs);
		return 0;
	}
	return seqs;
}

/*
 * Release an array of n_seqs reads together with everything each record
 * owns: the CIGAR of every multi-hit, name, seq, rseq, qual, aln, md, the
 * multi array and the read's own CIGAR.  The array itself is freed last.
 */
void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs)
{
	int i, j;
	for (i = 0; i != n_seqs; ++i) {
		bwa_seq_t *p = seqs + i;
		for (j = 0; j < p->n_multi; ++j)
			free(p->multi[j].cigar); // free(NULL) is a no-op
		free(p->name);
		free(p->seq); free(p->rseq); free(p->qual); free(p->aln); free(p->md); free(p->multi);
		free(p->cigar);
	}
	free(seqs);
}
/*
 * Read up to n_needed reads with kseq_read() and convert them into bwa_seq_t
 * records, dropping the first `mid` bases (and qualities) of every read.
 *
 * Reads longer than MAX_READ_LENGTH are cut to that length before the prefix
 * is removed.  Bases are translated through nst_nt4_table, and a trailing
 * "/1" or "/2" is removed from the read name.
 *
 * bs        input handle (bs->ks is the kseq reader)
 * n_needed  maximum number of reads to return
 * n         receives the number of reads stored
 * is_comp   forwarded to seq_reverse() when building rseq
 * mid       number of leading bases to skip
 *
 * Returns a calloc'ed array of *n reads, or 0 when nothing was read.
 */
bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, unsigned int n_needed, unsigned int *n, int is_comp, int mid)
{
	bwa_seq_t *seqs, *p;
	kseq_t *seq = bs->ks;
	int n_seqs = 0, l, i;

	/* The loop stores a record before it tests the batch limit, so an empty
	 * request would write past a zero-sized array. */
	if (n_needed == 0) {
		*n = 0;
		return 0;
	}
	seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
	while ((l = kseq_read(seq)) >= 0) {
		if (l > MAX_READ_LENGTH) l = MAX_READ_LENGTH; // put a limit on sequence length
		p = &seqs[n_seqs++];
		p->tid = -1; // not assigned to a thread
		p->qual = 0;
		/* A read shorter than the skipped prefix used to yield a negative
		 * length; store it as an empty read so every input record still
		 * produces exactly one output record. */
		p->len = l > mid ? l - mid : 0;
		p->seq = (ubyte_t*)calloc(p->len, 1);
		/* index through unsigned char: a byte >= 0x80 must not become a
		 * negative subscript where plain char is signed */
		for (i = 0; i != p->len; ++i)
			p->seq[i] = nst_nt4_table[(unsigned char)seq->seq.s[i + mid]];
		p->rseq = (ubyte_t*)calloc(p->len, 1);
		memcpy(p->rseq, p->seq, p->len);
		seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
		seq_reverse(p->len, p->rseq, is_comp);

		p->name = strdup((const char*)seq->name.s);
		{ // trim /[12]$
			int t = strlen(p->name);
			if (t > 2 && p->name[t-2] == '/' && (p->name[t-1] == '1' || p->name[t-1] == '2')) p->name[t-2] = '\0';
		}

		if (seq->qual.l) { // copy quality
			/* one extra zeroed byte keeps the copy a valid C string, as the
			 * strdup() it replaced did */
			p->qual = (ubyte_t*)calloc(p->len + 1, 1);
			for (i = 0; i != p->len; ++i)
				p->qual[i] = seq->qual.s[mid + i];
		}

		if ((unsigned int)n_seqs == n_needed) break;
	}
	*n = n_seqs;
	if (n_seqs == 0) {
		free(seqs);
		return 0;
	}
	return seqs;
}
/*
 * Try to place a mate by Smith-Waterman inside the reference window implied
 * by an anchored hit h and the insert-size statistics st.
 *
 * The window is [beg, end): avg +/- EXT_STDDEV * std away from the hit, on
 * the side given by h->is_rev, clipped to [1, l_pac].  Nothing is done when
 * the window is shorter than the mate.  The mate is encoded through
 * nst_nt4_table (reverse-complemented when the hit is on the forward strand)
 * and aligned against the unpacked window.  With SSE2 a forward pass finds
 * the end of the alignment and a pass over the reversed sequences finds its
 * start; without SSE2 aln_local_core() is used.
 *
 * opt     alignment options (t: score threshold, q/r: gap open/extension)
 * l_pac   length of the packed reference
 * pac     2-bit packed reference
 * st      insert-size statistics (avg, std)
 * h       the anchored hit of the other read
 * l_mseq  mate length
 * mseq    mate bases as characters
 * a       output hit; expected to be zeroed by the caller except for flag
 * g_mat   5x5 scoring matrix
 */
void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const bsw2pestat_t *st, const bsw2hit_t *h, int l_mseq, const char *mseq, bsw2hit_t *a, int8_t g_mat[25])
{
	extern void seq_reverse(int len, ubyte_t *seq, int is_comp);
	int64_t k, beg, end;
	uint8_t *seq, *ref;
	int i;
	// compute the region start and end
	a->n_seeds = 1; a->flag |= BSW2_FLAG_MATESW; // before calling this routine, *a has been cleared with memset(0); the flag is set with 1<<6/7
	if (h->is_rev == 0) {
		beg = (int64_t)(h->k + st->avg - EXT_STDDEV * st->std - l_mseq + .499);
		if (beg < h->k) beg = h->k;
		end = (int64_t)(h->k + st->avg + EXT_STDDEV * st->std + .499);
		a->is_rev = 1; a->flag |= 16;
	} else {
		beg = (int64_t)(h->k + h->end - h->beg - st->avg - EXT_STDDEV * st->std + .499);
		end = (int64_t)(h->k + h->end - h->beg - st->avg + EXT_STDDEV * st->std + l_mseq + .499);
		if (end > h->k + (h->end - h->beg)) end = h->k + (h->end - h->beg);
		a->is_rev = 0;
	}
	if (beg < 1) beg = 1;
	if (end > l_pac) end = l_pac;
	if (end - beg < l_mseq) return;
	// generate the sequence: one buffer holds the mate followed by the window
	seq = malloc(l_mseq + (end - beg));
	ref = seq + l_mseq;
	for (k = beg; k < end; ++k)
		ref[k - beg] = pac[k>>2] >> ((~k&3)<<1) & 0x3;
	/* index nst_nt4_table through unsigned char: a byte >= 0x80 must not
	 * become a negative subscript where plain char is signed */
	if (h->is_rev == 0) {
		for (i = 0; i < l_mseq; ++i) { // on the reverse strand
			int c = nst_nt4_table[(unsigned char)mseq[i]];
			seq[l_mseq - 1 - i] = c > 3? 4 : 3 - c;
		}
	} else {
		for (i = 0; i < l_mseq; ++i) // on the forward strand
			seq[i] = nst_nt4_table[(unsigned char)mseq[i]];
	}
#ifndef _NO_SSE2
	{
		ksw_query_t *q;
		ksw_aux_t aux[2];
		// forward Smith-Waterman
		aux[0].T = opt->t; aux[0].gapo = opt->q; aux[0].gape = opt->r; aux[1] = aux[0];
		q = ksw_qinit(l_mseq * g_mat[0] < 250? 1 : 2, l_mseq, seq, 5, g_mat);
		ksw_sse2(q, end - beg, ref, &aux[0]);
		free(q);
		if (aux[0].score < opt->t) {
			free(seq);
			return;
		}
		++aux[0].qe; ++aux[0].te;
		// reverse Smith-Waterman
		seq_reverse(aux[0].qe, seq, 0);
		seq_reverse(aux[0].te, ref, 0);
		q = ksw_qinit(aux[0].qe * g_mat[0] < 250? 1 : 2, aux[0].qe, seq, 5, g_mat);
		ksw_sse2(q, aux[0].te, ref, &aux[1]);
		free(q);
		++aux[1].qe; ++aux[1].te;
		// write output
		a->G = aux[0].score;
		a->G2 = aux[0].score2 > aux[1].score2? aux[0].score2 : aux[1].score2;
		if (a->G2 < opt->t) a->G2 = 0;
		if (a->G2) a->flag |= BSW2_FLAG_TANDEM;
		a->k = beg + (aux[0].te - aux[1].te);
		a->len = aux[1].te;
		a->beg = aux[0].qe - aux[1].qe;
		a->end = aux[0].qe;
	}
#else
	{
		AlnParam ap;
		path_t path[2];
		int matrix[25];
		for (i = 0; i < 25; ++i) matrix[i] = g_mat[i];
		ap.gap_open = opt->q; ap.gap_ext = opt->r; ap.gap_end = opt->r;
		ap.matrix = matrix; ap.row = 5; ap.band_width = 50;
		a->G = aln_local_core(ref, end - beg, seq, l_mseq, &ap, path, 0, opt->t, &a->G2);
		if (a->G < opt->t) a->G = 0;
		if (a->G2 < opt->t) a->G2 = 0;
		a->k = beg + path[0].i - 1;
		a->len = path[1].i - path[0].i + 1;
		a->beg = path[0].j - 1;
		a->end = path[1].j;
	}
#endif
	// query coordinates were computed on the reverse-complemented mate
	if (a->is_rev) i = a->beg, a->beg = l_mseq - a->end, a->end = l_mseq - i;
	free(seq);
}
/*
 * Print one SAM record for read p to stdout.
 *
 * bns        reference annotations used to turn packed coordinates into
 *            sequence name + offset
 * p          the read; NOTE: it is modified -- an unmapped read with a mapped
 *            mate takes the mate's pos/strand, and p->qual is reversed in
 *            place when p->strand is set
 * mate       the mate, or NULL for single-end output
 * mode       BWA_MODE_COMPREAD selects the "NM" tag name instead of "CM"
 * max_top2   X1 is printed only when p->c1 <= max_top2
 * bwa_rg_id  read-group id for the RG:Z tag, or NULL for none
 *
 * If the read or its mate is mapped, a full record with coordinates, CIGAR,
 * mate fields and the X* tags is written; otherwise a bare unmapped record.
 * The RG tag is written for both kinds of record.
 */
void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2, const char *bwa_rg_id)
{
	int j;

	if (p->type != BWA_TYPE_NO_MATCH || (mate && mate->type != BWA_TYPE_NO_MATCH)) {
		int seqid, nn, am = 0, flag = p->extra_flag;
		char XT;

		if (p->type == BWA_TYPE_NO_MATCH) {
			// unmapped read with a mapped mate: place it at the mate's position
			p->pos = mate->pos;
			p->strand = mate->strand;
			flag |= SAM_FSU;
			j = 1;
		} else j = pos_end(p) - p->pos; // j is the length of the reference in the alignment

		// get seqid
		nn = bns_coor_pac2real(bns, p->pos, j, &seqid);
		if (p->type != BWA_TYPE_NO_MATCH && p->pos + j - bns->anns[seqid].offset > bns->anns[seqid].len)
			flag |= SAM_FSU; // flag UNMAP as this alignment bridges two adjacent reference sequences

		// update flag and print it
		if (p->strand) flag |= SAM_FSR;
		if (mate) {
			if (mate->type != BWA_TYPE_NO_MATCH) {
				if (mate->strand) flag |= SAM_FMR;
			} else flag |= SAM_FMU;
		}
		printf("%s\t%d\t%s\t", p->name, flag, bns->anns[seqid].name);
		printf("%d\t%d\t", (int)(p->pos - bns->anns[seqid].offset + 1), p->mapQ);

		// print CIGAR
		if (p->cigar) {
			for (j = 0; j != p->n_cigar; ++j)
				printf("%d%c", __cigar_len(p->cigar[j]), "MIDSN"[__cigar_op(p->cigar[j])]);
		} else if (p->type == BWA_TYPE_NO_MATCH) printf("*");
		else printf("%dM", p->len);

		// print mate coordinate
		if (mate && mate->type != BWA_TYPE_NO_MATCH) {
			int m_seqid;
			long long isize;
			am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality
			// redundant calculation here, but should not matter too much
			bns_coor_pac2real(bns, mate->pos, mate->len, &m_seqid);
			printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name);
			isize = (seqid == m_seqid)? pos_5(mate) - pos_5(p) : 0;
			if (p->type == BWA_TYPE_NO_MATCH) isize = 0;
			printf("%d\t%lld\t", (int)(mate->pos - bns->anns[m_seqid].offset + 1), isize);
		} else if (mate) printf("\t=\t%d\t0\t", (int)(p->pos - bns->anns[seqid].offset + 1));
		else printf("\t*\t0\t0\t");

		// print sequence and quality
		if (p->strand == 0)
			for (j = 0; j != p->full_len; ++j) putchar("ACGTN"[(int)p->seq[j]]);
		else
			for (j = 0; j != p->full_len; ++j) putchar("TGCAN"[p->seq[p->full_len - 1 - j]]);
		putchar('\t');
		if (p->qual) {
			if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
			printf("%s", p->qual);
		} else printf("*");

		if (bwa_rg_id) printf("\tRG:Z:%s", bwa_rg_id);
		if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len);
		if (p->type != BWA_TYPE_NO_MATCH) {
			int i;
			// calculate XT tag
			XT = "NURM"[p->type];
			if (nn > 10) XT = 'N';
			// print tags
			printf("\tXT:A:%c\t%s:i:%d", XT, (mode & BWA_MODE_COMPREAD)? "NM" : "CM", p->nm);
			// print XS tag, to be compatible with Cufflinks
			if (p->sense_strand != 2 ) printf("\tXS:A:%c", p->sense_strand ? '-':'+' );
			else printf("\tXS:A:.");

			if (nn) printf("\tXN:i:%d", nn);
			if (mate) printf("\tSM:i:%lu\tAM:i:%d", p->seQ, am);
			if (p->type != BWA_TYPE_MATESW) { // X0 and X1 are not available for this type of alignment
				printf("\tX0:i:%lu", p->c1);
				if (p->c1 <= max_top2) printf("\tX1:i:%lu", p->c2);
			}
			printf("\tXM:i:%d\tXO:i:%d\tXG:i:%d", p->n_mm, p->n_gapo_t + p->n_gapo_q, p->n_gapo_t+p->n_gape_t+p->n_gapo_q+p->n_gape_q);
			if (p->md) printf("\tMD:Z:%s", p->md);
			// print multiple hits
			if (p->n_multi) {
				bool header_printed = 0;
				for (i = 0; i < p->n_multi; ++i) {
					bwt_multi1_t *q = p->multi + i;
					int k;
					j = pos_end_multi(q, p->len) - q->pos;
					nn = bns_coor_pac2real(bns, q->pos, j, &seqid);
					// the alignment bridges adjacent sequences (chroms)
					// TODO: need to avoid this at the first place in the junction discovery step, but this should be rare for mm or human
					if (pos_end_multi(q, p->len) - bns->anns[seqid].offset > bns->anns[seqid].len)
						continue;
					if (!header_printed) {
						// emit the tag name only once a hit survives the filter
						header_printed = 1;
						printf("\tXA:Z:");
					}
					printf("%s,%c%d,", bns->anns[seqid].name, q->strand? '-' : '+',
						   (int)(q->pos - bns->anns[seqid].offset + 1));
					if (q->cigar) {
						for (k = 0; k < q->n_cigar; ++k)
							printf("%d%c", __cigar_len(q->cigar[k]), "MIDSN"[__cigar_op(q->cigar[k])]);
					} else printf("%dM", p->len);
					printf(",%d", q->nm);
					if (q->sense_strand != 2) printf(",%c;", q->sense_strand? '-' : '+' );
					else printf(",.;");
				}
			}
		}
		putchar('\n');
	} else { // this read has no match
		ubyte_t *s = p->strand? p->rseq : p->seq;
		int flag = p->extra_flag | SAM_FSU;
		if (mate && mate->type == BWA_TYPE_NO_MATCH) flag |= SAM_FMU;
		printf("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t", p->name, flag);
		for (j = 0; j != p->len; ++j) putchar("ACGTN"[(int)s[j]]);
		putchar('\t');
		if (p->qual) {
			if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
			printf("%s", p->qual);
		} else printf("*");
		// unmapped reads belong to the read group too; this tag used to be
		// written for mapped records only
		if (bwa_rg_id) printf("\tRG:Z:%s", bwa_rg_id);
		if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len);
		putchar('\n');
	}
}
/**
 * Look up a seed in the index and report its suffix-array interval on the
 * requested strand.
 *
 * The read is encoded through nst_nt4_table into a freshly allocated
 * bwa_seq_t, which is handed to mybwa_cal_sa_reg_gap() with the file-level
 * bwt_bwt / bwt_opt.  If one of the (at most two) resulting alignments has
 * its `a` field equal to strand, its k/l interval is returned; otherwise the
 * empty interval k=1, l=0 is reported.
 *
 * @param read      seed bases as characters (at least read_len of them)
 * @param read_len  number of bases
 * @param strand    value compared against each alignment's `a` field
 * @param num       receives the interval size (l - k + 1), 0 if none
 * @param sa_k      receives the interval start
 * @param sa_l      receives the interval end
 * @return the calloc'ed sequence record; it is not released here
 */
extern "C" bwa_seq_t * bwa_seed2genome_map(const char* read, int read_len, int strand, uint64_t *num, uint64_t *sa_k, uint64_t *sa_l)
{
	bwa_seq_t *p = (bwa_seq_t*)calloc(1, sizeof(bwa_seq_t));
	int n_seqs = 1;
	int l = read_len;
	int hit = -1; // index into p->aln of the alignment on the wanted strand

	p->tid = -1; // not assigned to a thread
	p->qual = NULL;
	p->full_len = p->clip_len = p->len = l;
	p->seq = (ubyte_t*)calloc(p->len, 1);
	// Index through unsigned char: a byte >= 0x80 must not become a negative
	// subscript where plain char is signed.
	for (int i = 0; i != p->full_len; ++i)
		p->seq[i] = nst_nt4_table[(unsigned char)read[i]];
	p->rseq = (ubyte_t*)calloc(p->full_len, 1);
	memcpy(p->rseq, p->seq, p->len);
	seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
	seq_reverse(p->len, p->rseq, bwt_opt->mode & BWA_MODE_COMPREAD);
	p->name = strdup("seq");
	p->cigar = NULL;

	mybwa_cal_sa_reg_gap(0, bwt_bwt, n_seqs, p, bwt_opt);

	if (p->n_aln > 0) {
		assert(p->n_aln <= 2);
		if (p->aln[0].a == strand)
			hit = 0;
		else if (p->n_aln >= 2 && p->aln[1].a == strand)
			hit = 1;
	}

	if (hit >= 0) {
		*sa_k = p->aln[hit].k;
		*sa_l = p->aln[hit].l;
		*num = *sa_l - *sa_k + 1;
	} else {
		// empty interval: nothing on the requested strand
		*sa_k = 1;
		*sa_l = 0;
		*num = 0;
	}

	return p;
}