static void copy_check_alignment(const char *infname, const char *informat, const char *outfname, const char *outmode, const char *outref) { samFile *in = sam_open(infname, "r"); samFile *out = sam_open(outfname, outmode); bam1_t *aln = bam_init1(); bam_hdr_t *header = NULL; int res; if (!in) { fail("couldn't open %s", infname); goto err; } if (!out) { fail("couldn't open %s with mode %s", outfname, outmode); goto err; } if (!aln) { fail("bam_init1() failed"); goto err; } if (outref) { if (hts_set_opt(out, CRAM_OPT_REFERENCE, outref) < 0) { fail("setting reference %s for %s", outref, outfname); goto err; } } header = sam_hdr_read(in); if (!header) { fail("reading header from %s", infname); goto err; } if (sam_hdr_write(out, header) < 0) fail("writing headers to %s", outfname); while ((res = sam_read1(in, header, aln)) >= 0) { int mod4 = ((intptr_t) bam_get_cigar(aln)) % 4; if (mod4 != 0) fail("%s CIGAR not 4-byte aligned; offset is 4k+%d for \"%s\"", informat, mod4, bam_get_qname(aln)); if (sam_write1(out, header, aln) < 0) fail("writing to %s", outfname); } if (res < -1) { fail("failed to read alignment from %s", infname); } err: bam_destroy1(aln); bam_hdr_destroy(header); if (in) sam_close(in); if (out) sam_close(out); }
// Function to compare reads and determine which one is < the other static inline int bam1_lt(const bam1_p a, const bam1_p b) { if (g_is_by_qname) { int t = strnum_cmp(bam_get_qname(a), bam_get_qname(b)); return (t < 0 || (t == 0 && (a->core.flag&0xc0) < (b->core.flag&0xc0))); } else return (((uint64_t)a->core.tid<<32|(a->core.pos+1)<<1|bam_is_rev(a)) < ((uint64_t)b->core.tid<<32|(b->core.pos+1)<<1|bam_is_rev(b))); }
// Function to compare reads in the heap and determine which one is < the other static inline int heap_lt(const heap1_t a, const heap1_t b) { if (g_is_by_qname) { int t; if (a.b == NULL || b.b == NULL) return a.b == NULL? 1 : 0; t = strnum_cmp(bam_get_qname(a.b), bam_get_qname(b.b)); return (t > 0 || (t == 0 && (a.b->core.flag&0xc0) > (b.b->core.flag&0xc0))); } else return __pos_cmp(a, b); }
/*
 * Return the name of a read, with "/1" or "/2" appended when the read is
 * flagged as paired (the suffix is selected by get_read_idx()).
 *
 * For unpaired reads the pointer refers to the query name stored inside the
 * record.  For paired reads it refers to a static buffer that is overwritten
 * by the next call, so the result must be copied if it has to outlive that
 * call; the function is therefore not reentrant.
 *
 * The previous implementation strcpy()'d the suffix over the query name held
 * in the (const) record, which both destroyed the record's name and returned
 * only the suffix.  The record is no longer modified.
 */
const char* get_read_name(const bam1_t *b) {
    static const char *const suffix[2] = {"/1", "/2"};
    static char buf[260];
    const char *name = bam_get_qname(b);
    size_t len;

    if (!(b->core.flag & BAM_FPAIRED))
        return name;

    /* Leave room for the two suffix characters and the terminator. */
    len = strlen(name);
    if (len > sizeof(buf) - 3)
        len = sizeof(buf) - 3;
    memcpy(buf, name, len);
    strcpy(buf + len, suffix[get_read_idx(b)]);
    return buf;
}
// Convert a BAM record into an Alignment message.
//
// The alignment receives the read name (query name plus "/1" and/or "/2"
// according to the READ1/READ2 flags), the decoded base sequence, the raw
// quality bytes and the secondary flag.  When rg_sample is non-empty and the
// record carries an RG tag, the sample name is looked up in rg_sample by read
// group; if that lookup yields a non-empty name, both sample name and read
// group are set.  Note the lookup uses operator[], so an unknown read group
// is inserted into rg_sample with an empty sample name.
//
// A record without an RG tag is handled: previously the NULL returned by
// bam_aux_get() was offset by one and read as a string.
Alignment bam_to_alignment(const bam1_t *b, map<string, string>& rg_sample) {

    Alignment alignment;

    // get the sequence and qual
    int32_t lqseq = b->core.l_qseq;
    string sequence; sequence.resize(lqseq);

    uint8_t* qualptr = bam_get_qual(b);
    string quality;
    quality.assign((char*)qualptr, lqseq);

    // process the sequence into chars
    uint8_t* seqptr = bam_get_seq(b);
    for (int i = 0; i < lqseq; ++i) {
        sequence[i] = "=ACMGRSVTWYHKDBN"[bam_seqi(seqptr, i)];
    }

    // get the read group and sample name; the tag value follows the one-byte
    // type code, and the tag may be absent altogether
    uint8_t *rgptr = bam_aux_get(b, "RG");
    char* rg = rgptr ? (char*) (rgptr+1) : NULL;
    string sname;
    if (rg && !rg_sample.empty()) {
        sname = rg_sample[string(rg)];
    }

    // Now name the read after the scaffold
    string read_name = bam_get_qname(b);

    // Decide if we are a first read (/1) or second (last) read (/2)
    if(b->core.flag & BAM_FREAD1) {
        read_name += "/1";
    }
    if(b->core.flag & BAM_FREAD2) {
        read_name += "/2";
    }

    // If we are marked as both first and last we get /1/2, and if we are marked
    // as neither the scaffold name comes through unchanged as the read name.
    // TODO: produce correct names for intermediate reads on >2 read scaffolds.

    // add features to the alignment
    alignment.set_name(read_name);
    alignment.set_sequence(sequence);
    alignment.set_quality(quality);

    // TODO: htslib doesn't wrap this flag for some reason.
    alignment.set_is_secondary(b->core.flag & BAM_FSECONDARY);
    // sname can only be non-empty when rg is non-NULL
    if (sname.size()) {
        alignment.set_sample_name(sname);
        alignment.set_read_group(rg);
    }

    return alignment;
}
/*
 * Width in bytes of a fixed-size BAM auxiliary value of the given type code,
 * or 0 when the type is not fixed-size (or not recognised).
 */
static int dump_read_aux_size(uint8_t type)
{
    switch (type) {
    case 'A': case 'c': case 'C': return 1;
    case 's': case 'S':           return 2;
    case 'i': case 'I': case 'f': return 4;
    case 'd':                     return 8;
    default:                      return 0;
    }
}

/*
 * Debugging aid: print the contents of an alignment record to stdout.
 *
 * Prints the core fields, the raw data bytes in hex, the query name, the
 * decoded bases, the quality bytes (written as raw bytes, not offset by 33)
 * and one line per auxiliary tag in the form "TG:t:"; the value is printed
 * only for 'Z' (string) tags.
 *
 * Bases are decoded straight from the 4-bit codes with seq_nt16_str.  The
 * auxiliary walk advances by the real width of each value (including 'H'
 * strings and 'B' arrays) and stops at the first tag it cannot size or that
 * would run past the end of the auxiliary data.
 */
void dump_read(bam1_t* b) {
    printf("->core.tid:(%d)\n", b->core.tid);
    printf("->core.pos:(%d)\n", b->core.pos);
    printf("->core.bin:(%d)\n", b->core.bin);
    printf("->core.qual:(%d)\n", b->core.qual);
    printf("->core.l_qname:(%d)\n", b->core.l_qname);
    printf("->core.flag:(%d)\n", b->core.flag);
    printf("->core.n_cigar:(%d)\n", b->core.n_cigar);
    printf("->core.l_qseq:(%d)\n", b->core.l_qseq);
    printf("->core.mtid:(%d)\n", b->core.mtid);
    printf("->core.mpos:(%d)\n", b->core.mpos);
    printf("->core.isize:(%d)\n", b->core.isize);

    if (b->data) {
        printf("->data:");
        int i;
        for (i = 0; i < b->l_data; ++i) {
            printf("%x ", b->data[i]);
        }
        printf("\n");
    }
    if (b->core.l_qname) {
        printf("qname: %s\n",bam_get_qname(b));
    }
    if (b->core.l_qseq) {
        printf("qseq:");
        int i;
        for (i = 0; i < b->core.l_qseq; ++i) {
            /* bam_seqi() already yields the 4-bit code seq_nt16_str expects */
            printf("%c",seq_nt16_str[bam_seqi(bam_get_seq(b),i)]);
        }
        printf("\n");
        printf("qual:");
        for (i = 0; i < b->core.l_qseq; ++i) {
            printf("%c",bam_get_qual(b)[i]);
        }
        printf("\n");
    }

    const int l_aux = bam_get_l_aux(b);
    if (l_aux) {
        const uint8_t* aux = bam_get_aux(b);
        int i = 0;

        /* Each entry is a two-character tag, a type code, then the value. */
        while (i + 3 <= l_aux) {
            const uint8_t type = aux[i + 2];
            int ok = 1;

            printf("%.2s:%c:", (const char *)(aux + i), type);
            i += 3;

            if (type == 'Z' || type == 'H') {
                /* NUL-terminated text; only 'Z' values are echoed */
                while (i < l_aux && aux[i] != '\0') {
                    if (type == 'Z') putc(aux[i], stdout);
                    ++i;
                }
                ++i; /* step over the terminator */
            } else if (type == 'B') {
                /* element type, little-endian 32-bit count, then elements */
                if (i + 5 > l_aux) {
                    ok = 0;
                } else {
                    const int elem = dump_read_aux_size(aux[i]);
                    const uint32_t count = (uint32_t)aux[i + 1]
                                         | (uint32_t)aux[i + 2] << 8
                                         | (uint32_t)aux[i + 3] << 16
                                         | (uint32_t)aux[i + 4] << 24;
                    const int64_t bytes = 5 + (int64_t)count * elem;
                    if (elem == 0 || bytes > (int64_t)(l_aux - i))
                        ok = 0;
                    else
                        i += (int)bytes;
                }
            } else {
                const int width = dump_read_aux_size(type);
                if (width == 0 || width > l_aux - i)
                    ok = 0;
                else
                    i += width;
            }

            putc('\n',stdout);
            if (!ok) break; /* cannot locate the next tag reliably */
        }
    }
    printf("\n");
}
// Returns 0 to indicate read should be output 1 otherwise static int process_aln(const bam_hdr_t *h, bam1_t *b, samview_settings_t* settings) { if (settings->remove_B) bam_remove_B(b); if (settings->min_qlen > 0) { int k, qlen = 0; uint32_t *cigar = bam_get_cigar(b); for (k = 0; k < b->core.n_cigar; ++k) if ((bam_cigar_type(bam_cigar_op(cigar[k]))&1) || bam_cigar_op(cigar[k]) == BAM_CHARD_CLIP) qlen += bam_cigar_oplen(cigar[k]); if (qlen < settings->min_qlen) return 1; } if (b->core.qual < settings->min_mapQ || ((b->core.flag & settings->flag_on) != settings->flag_on) || (b->core.flag & settings->flag_off)) return 1; if (settings->bed && (b->core.tid < 0 || !bed_overlap(settings->bed, h->target_name[b->core.tid], b->core.pos, bam_endpos(b)))) return 1; if (settings->subsam_frac > 0.) { uint32_t k = __ac_Wang_hash(__ac_X31_hash_string(bam_get_qname(b)) ^ settings->subsam_seed); if ((double)(k&0xffffff) / 0x1000000 >= settings->subsam_frac) return 1; } if (settings->rghash) { uint8_t *s = bam_aux_get(b, "RG"); if (s) { khint_t k = kh_get(rg, settings->rghash, (char*)(s + 1)); if (k == kh_end(settings->rghash)) return 1; } } if (settings->library) { const char *p = bam_get_library((bam_hdr_t*)h, b); if (!p || strcmp(p, settings->library) != 0) return 1; } if (settings->remove_aux_len) { size_t i; for (i = 0; i < settings->remove_aux_len; ++i) { uint8_t *s = bam_aux_get(b, settings->remove_aux[i]); if (s) { bam_aux_del(b, s); } } } return 0; }
Mapping(const bam_hdr_t * hdr_p, bam1_t * rec_p) : _rec_p(rec_p) { _query_name = bam_get_qname(rec_p); _flag = rec_p->core.flag; for (int i = 0; i < rec_p->core.l_qseq; ++i) { _seq += seq_nt16_str[bam_seqi(bam_get_seq(rec_p), i)]; } if (is_mapped()) { _chr_name = hdr_p->target_name[rec_p->core.tid]; _rf_start = rec_p->core.pos; _cigar = Cigar(bam_get_cigar(rec_p), rec_p->core.n_cigar); _rf_len = _cigar.rf_len(); } if (is_paired() and mp_is_mapped()) { _mp_chr_name = hdr_p->target_name[rec_p->core.mtid]; _mp_rf_start = rec_p->core.mpos; } }
// Transform a bam1_t record into a string with the FASTQ representation of it // @returns false for error, true for success static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state) { int i; int32_t qlen = b->core.l_qseq; assert(qlen >= 0); uint8_t *seq; uint8_t *qual = bam_get_qual(b); const uint8_t *oq = NULL; if (state->use_oq) { oq = bam_aux_get(b, "OQ"); if (oq) oq++; // skip tag type } bool has_qual = (qual[0] != 0xff || (state->use_oq && oq)); // test if there is quality linebuf->l = 0; // Write read name readpart readpart = which_readpart(b); kputc(state->filetype == FASTA? '>' : '@', linebuf); kputs(bam_get_qname(b), linebuf); // Add the /1 /2 if requested if (state->has12) { if (readpart == READ_1) kputs("/1", linebuf); else if (readpart == READ_2) kputs("/2", linebuf); } if (state->copy_tags) { for (i = 0; copied_tags[i]; ++i) { uint8_t *s; if ((s = bam_aux_get(b, copied_tags[i])) != 0) { kputc('\t', linebuf); kputsn(copied_tags[i], 2, linebuf); kputsn(":Z:", 3, linebuf); kputs(bam_aux2Z(s), linebuf); } } } kputc('\n', linebuf); seq = bam_get_seq(b); if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented for (i = qlen-1; i > -1; --i) { char c = seq_nt16_str[seq_comp_table[bam_seqi(seq,i)]]; kputc(c, linebuf); } } else { for (i = 0; i < qlen; ++i) { char c = seq_nt16_str[bam_seqi(seq,i)]; kputc(c, linebuf); } } kputc('\n', linebuf); if (state->filetype == FASTQ) { // Write quality kputs("+\n", linebuf); if (has_qual) { if (state->use_oq && oq) { if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented for (i = qlen-1; i > -1; --i) { kputc(oq[i], linebuf); } } else { kputs((char*)oq, linebuf); } } else { if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented for (i = qlen-1; i > -1; --i) { kputc(33 + qual[i], linebuf); } } else { for (i = 0; i < qlen; ++i) { kputc(33 + qual[i], linebuf); } } } } else { for (i = 0; i < qlen; ++i) { kputc(33 + state->def_qual, linebuf); } } kputc('\n', 
linebuf); } return true; }
// Read the next primary alignment and work out the reference coordinates
// spanned by the call described by `centry`.
//
// The function consumes records through sam_read1() using samfh, bam_header
// and bamentry, and also uses genome, kmer_size, min_mapq, nw_aligner, aln,
// nw_scoring_flank and several num_* counters; none of these are declared
// here, so they are state defined elsewhere in the file.
//
// Outputs (written through the pointer arguments):
//   fw_strand_ptr - true when the alignment is not on the reverse strand
//   chrom_ptr     - chromosome returned by seq_fetch_chrom() for the alignment
//   cpy_flnk_5p   - value of bam_get_end_padding() for the alignment's CIGAR
//   cpy_flnk_3p   - reset to 0, then increased on the approximate-match path
//   start, end    - reference coordinates between the 5' flank alignment and
//                   the located 3' flank k-mer (order depends on strand)
//
// Returns true when coordinates were set; false when the alignment is
// unmapped, below min_mapq, the end k-mer hits more than once, or the
// approximate alignment has too few matches.  Each outcome bumps its own
// counter.  die() is called if no more records can be read or the record
// name does not match the call entry's header line.
static bool sam_fetch_coords(const CallFileEntry *centry,
                             const char *flank5p, size_t flank5p_len,
                             const char *flank3p, size_t flank3p_len,
                             size_t *cpy_flnk_5p, size_t *cpy_flnk_3p,
                             const read_t **chrom_ptr,
                             size_t *start, size_t *end,
                             bool *fw_strand_ptr)
{
  // Get the next primary alignment
  do {
    if(sam_read1(samfh, bam_header, bamentry) < 0)
      die("We've run out of SAM entries!");
  } while(bamentry->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY));

  if(bamentry->core.flag & BAM_FUNMAP) { num_flank5p_unmapped++; return false; }
  if(bamentry->core.qual < min_mapq) { num_flank5p_lowqual++; return false; }

  bool fw_strand = !bam_is_rev(bamentry);
  *fw_strand_ptr = fw_strand;

  // NOTE(review): tid is used as an index without a range check; this relies
  // on mapped records always carrying a valid tid -- confirm.
  const char *chrom_name = bam_header->target_name[bamentry->core.tid];
  const read_t *chrom = seq_fetch_chrom(genome, chrom_name);
  *chrom_ptr = chrom;

  const uint32_t *cigar = bam_get_cigar(bamentry);
  int cigar2rlen = bam_cigar2rlen(bamentry->core.n_cigar, cigar);

  // cpy_flnk_5p is soft clipped at right end of flank
  // Eat up hard masked (H), soft masked (S) and inserted bases (relative to ref) (I)
  *cpy_flnk_5p = bam_get_end_padding(bamentry->core.n_cigar, cigar);
  *cpy_flnk_3p = 0; // set this later

  // Get bam query name
  char *bname = bam_get_qname(bamentry);

  // Check entry/flank names match
  const char *hdrline = call_file_get_line(centry, 0);
  if(hdrline[0] != '>') die("Unexpected line: %s", hdrline);
  hdrline++;
  const char *hdrline_end = str_fasta_name_end(hdrline);
  int hdrline_len = hdrline_end - hdrline;
  // Only the first hdrline_len characters are compared, so the record name
  // merely has to start with the call entry's name.
  if(strncmp(hdrline, bname, hdrline_len) != 0)
    die("SAM/BAM and call entries mismatch '%s' vs '%s'", hdrline, bname);

  // Find 3p flank position using search for first kmer
  char endkmer[200];
  ctx_assert(kmer_size+1 <= sizeof(endkmer));
  ctx_assert(flank3p_len >= kmer_size || call_file_min_allele_len(centry) == 0);
  bubble_get_end_kmer(flank5p, flank5p_len, flank3p, flank3p_len, kmer_size, endkmer);
  // The k-mer is searched for on the reference, so flip it for reverse hits
  if(!fw_strand) dna_revcomp_str(endkmer, endkmer, kmer_size);

  // Determine search space
  // Choose a region of the ref to search for the end flank
  // end is index after last char
  long search_start, search_end;
  size_t longest_allele = call_file_max_allele_len(centry);

  if(fw_strand) {
    search_start = (long)bamentry->core.pos + cigar2rlen - kmer_size*2;
    search_end = (long)bamentry->core.pos + cigar2rlen + longest_allele + kmer_size*2 + 10;
  } else {
    search_start = (long)bamentry->core.pos - (longest_allele + kmer_size*2 + 10);
    search_end = (long)bamentry->core.pos + kmer_size*2;
  }

  // Clamp the window to the chromosome
  search_start = MAX2(search_start, 0);
  search_end = MIN2(search_end, (long)chrom->seq.end);

  const char *search_region = chrom->seq.b + search_start;
  // NOTE(review): if search_start ended up beyond search_end this difference
  // would wrap to a very large size_t -- confirm this cannot happen.
  size_t search_len = (size_t)(search_end - search_start);

  // Now do search with kmer
  // Attempt to find perfect match for kmer within search region
  // Search, if there is more than one match -> abandon
  const char *kmer_match = ctx_strnstr(search_region, endkmer, search_len);

  if(kmer_match != NULL)
  {
    // Check for multiple hits
    size_t rem_search_len = search_region+search_len-kmer_match;
    if(ctx_strnstr(kmer_match+1, endkmer, rem_search_len-1) != NULL) {
      num_flank3p_multihits++;
      return false;
    }

    if(fw_strand) {
      *start = bamentry->core.pos + cigar2rlen;
      *end = kmer_match - chrom->seq.b;
    } else {
      *start = kmer_match + kmer_size - chrom->seq.b;
      *end = bamentry->core.pos;
    }
    num_flank3p_exact_match++;
    return true;
  }
  else
  {
    // Look for approximate match
    needleman_wunsch_align2(search_region, endkmer, search_len, kmer_size,
                            &nw_scoring_flank, nw_aligner, aln);
    num_nw_flank++;
    const char *ref = aln->result_a, *alt = aln->result_b;
    // --aa--dd-cge
    // bb--ccd-ecge

    // Find positions of first and last match
    // The offsets count the non-gap characters skipped on each side before
    // the first / after the last matching column.
    int i, l, r, matches = 0;
    int ref_offset_left = 0, ref_offset_rght = 0;
    int alt_offset_left = 0, alt_offset_rght = 0;

    for(l = 0; l < (int)aln->length && ref[l] != alt[l]; l++) {
      ref_offset_left += (ref[l] != '-');
      alt_offset_left += (alt[l] != '-');
    }
    for(r = aln->length-1; r > 0 && ref[r] != alt[r]; r--) {
      ref_offset_rght += (ref[r] != '-');
      alt_offset_rght += (alt[r] != '-');
    }

    // Count matches
    for(i = l; i <= r; i++) matches += (ref[i] == alt[i]);

    if(matches < (int)kmer_size / 2)
    {
      // flank doesn't map well
      num_flank3p_not_found++;
      return false;
    }

    num_flank3p_approx_match++;

    *cpy_flnk_3p += fw_strand ? alt_offset_left : alt_offset_rght;

    if(fw_strand) {
      *start = bamentry->core.pos + cigar2rlen;
      *end = search_region + ref_offset_left - chrom->seq.b;
    } else {
      *start = (search_region + search_len - ref_offset_rght) - chrom->seq.b;
      *end = bamentry->core.pos;
    }

    return true;
  }
}
/*
 * Expand the bases of read b into s, one 4-bit base code per byte, laid out
 * along the reference according to the CIGAR: aligned bases (M, =, X) are
 * copied, deletions (D) and reference skips (N) are filled with 0, soft
 * clips advance through the read without output and hard clips are ignored.
 * A warning is printed the first time an N operation is met in the read.
 *
 * Returns 0 when the number of entries written equals the reference length
 * implied by the CIGAR, 1 when it does not, and -1 (after printing an error)
 * if any other CIGAR operation is encountered.
 */
static int unpad_seq(bam1_t *b, kstring_t *s)
{
    uint32_t *ops = bam_get_cigar(b);
    uint8_t *bases = bam_get_seq(b);
    int warned_about_n = 0;
    int op_idx, read_pos, n;

    /* b->core.l_qseq covers soft clips too; what is needed here is the
     * padded length implied by the CIGAR (no soft clips, deletions kept). */
    int length = bam_cigar2rlen(b->core.n_cigar, ops);
    ks_resize(s, length);

    s->l = 0;
    read_pos = 0;
    for (op_idx = 0; op_idx < b->core.n_cigar; ++op_idx) {
        const int op = bam_cigar_op(ops[op_idx]);
        const int ol = bam_cigar_oplen(ops[op_idx]);

        switch (op) {
        case BAM_CMATCH:
        case BAM_CEQUAL:
        case BAM_CDIFF:
            for (n = 0; n < ol; ++n, ++read_pos)
                s->s[s->l++] = bam_seqi(bases, read_pos);
            break;

        case BAM_CSOFT_CLIP:
            read_pos += ol;
            break;

        case BAM_CHARD_CLIP:
            /* nothing stored in the read for hard clips */
            break;

        case BAM_CDEL:
            for (n = 0; n < ol; ++n)
                s->s[s->l++] = 0;
            break;

        case BAM_CREF_SKIP:
            /* Treat CIGAR N as D (not ideal, but better than ignoring it) */
            for (n = 0; n < ol; ++n)
                s->s[s->l++] = 0;
            if (!warned_about_n) {
                warned_about_n = 1;
                fprintf(stderr, "[depad] WARNING: CIGAR op N treated as op D in read %s\n", bam_get_qname(b));
            }
            break;

        default:
            fprintf(stderr, "[depad] ERROR: Didn't expect CIGAR op %c in read %s\n", BAM_CIGAR_STR[op], bam_get_qname(b));
            return -1;
        }
    }
    return length != s->l;
}
BM_mappedRead * extractReads(char * bamFile, char ** contigs, int numContigs, uint16_t * groups, char * prettyName, int headersOnly, int minMapQual, int maxMisMatches, int ignoreSuppAlignments, int ignoreSecondaryAlignments) { //----- // code uses the pattern outlined in samtools view (sam_view.c) // thanks lh3! // int i = 0; int result = -1; int hh = 0; int supp_check = 0x0; // include supp mappings if (ignoreSuppAlignments) { supp_check |= BAM_FSUPPLEMENTARY; } if (ignoreSecondaryAlignments) { supp_check |= BAM_FSECONDARY; } // we need to let the users know if their pairings // will be corrupted int p_corrupt = 0; // helper variables samFile *in = 0; bam_hdr_t *header = NULL; bam1_t *b = bam_init1(); BM_mappedRead * root = 0; BM_mappedRead * prev = 0; // open file handlers if ((in = sam_open(bamFile, "r")) == 0) { fprintf(stderr, "ERROR: Failed to open \"%s\" for reading.\n", bamFile); } else { // retrieve the header if ((header = sam_hdr_read(in)) == 0) { fprintf(stderr, "ERROR: Failed to read the header from \"%s\".\n", bamFile); } else { // check the index is intact hts_idx_t *idx = sam_index_load(in, bamFile); // load index if (idx == 0) { // index is unavailable fprintf(stderr, "ERROR: Random retrieval only works "\ "for indexed files.\n"); } else { cfuhash_table_t *pair_buffer = \ cfuhash_new_with_initial_size(1000000); cfuhash_set_flag(pair_buffer, CFUHASH_FROZEN_UNTIL_GROWS); for (hh = 0; hh < numContigs; ++hh) { // parse a region in the format like `chr2:100-200' hts_itr_t *iter = sam_itr_querys(idx, header, contigs[hh]); if (iter == NULL) { // reference name is not found fprintf(stderr, "WARNING: Could not find contig: "\ "[%s] in BAM: [%s].\n", contigs[hh], bamFile); } // fetch alignments int line = 0; while ((result = sam_itr_next(in, iter, b)) >= 0) { bam1_core_t core = b->core; line += 1; // only high quality?, primary? 
mappings if ( core.qual < minMapQual) continue; if ((core.flag & supp_check) != 0) continue; if(bam_aux2i(bam_aux_get(b, "NM")) > maxMisMatches) { continue; } char * seqId = bam_get_qname(b); char * seq = 0; char * qual = 0; int qual_len = 0; int seq_len = 0; // get sequence and quality if(0 == headersOnly) { // no point allocating unused space seq = calloc(core.l_qseq+1, sizeof(char)); qual = calloc(core.l_qseq+1, sizeof(char)); uint8_t *s = bam_get_seq(b); if (core.flag&BAM_FREVERSE) { // reverse the read int r = 0; for (i = core.l_qseq-1; i >=0 ; --i) { seq[r]="=TGKCYSBAWRDMHVN"[bam_seqi(s, i)]; ++r; } } else { for (i = 0; i < core.l_qseq; ++i) { seq[i]="=ACMGRSVTWYHKDBN"[bam_seqi(s, i)]; } } seq_len = core.l_qseq; s = bam_get_qual(b); if (s[0] != 0xff) { qual_len = core.l_qseq; for (i = 0; i < core.l_qseq; ++i) { qual[i] = (char)(s[i] + 33); } } else if (qual != 0) { free(qual); qual = 0; } } // work out pairing information uint8_t rpi = RPI_ERROR; if (core.flag&BAM_FPAIRED) { if(core.flag&BAM_FMUNMAP) { if (core.flag&BAM_FREAD1) { rpi = RPI_SNGL_FIR; } else if (core.flag&BAM_FREAD2) { rpi = RPI_SNGL_SEC; } } else { if (core.flag&BAM_FREAD1) { rpi = RPI_FIR; } else if (core.flag&BAM_FREAD2) { rpi = RPI_SEC; } } } else { rpi = RPI_SNGL; } // make the funky Id #define MAX_SEQ_ID_LEN 80 char * seq_id = calloc(MAX_SEQ_ID_LEN, sizeof(char)); // allocate the string to the buffer but check to // ensure we're not cutting anything off int id_len = snprintf(seq_id, MAX_SEQ_ID_LEN, "b_%s;c_%s;r_%s", prettyName, contigs[hh], seqId); if(id_len >= MAX_SEQ_ID_LEN) { seq_id = calloc(id_len+1, sizeof(char)); snprintf(seq_id, id_len+1, // don't forget the NULL! 
"b_%s;c_%s;r_%s", prettyName, contigs[hh], seqId); } // make the mapped read struct prev = makeMappedRead(seq_id, seq, qual, id_len, seq_len, qual_len, rpi, groups[hh], prev); if (0 == root) { root = prev; } if(rpi == RPI_SNGL || \ rpi == RPI_SNGL_FIR || \ rpi == RPI_SNGL_SEC) { // we can just add away // indicate singleton reads by pointing the // partner pointer to itself prev->partnerRead = prev; } else { // RPI_FIR or RPI_SEC // work out pairing information using the hash // we append a 1 or 2 to the end so that // we don't accidentally pair 1's with 1's etc. char * stripped_result; if(rpi == RPI_FIR) { stripped_result = \ pairStripper(seqId, core.l_qname-1, '2'); } else { stripped_result = \ pairStripper(seqId, core.l_qname-1, '1'); } char * stripped = seqId; if(stripped_result) stripped = stripped_result; //fprintf(stdout, "SEARCH %s\n", stripped); // now stripped always holds a stripped value // see if it is in the hash already BM_mappedRead * stored_MR = \ cfuhash_get(pair_buffer, stripped); if (0 != stored_MR) { // exists in the hash -> Add the pair info if(rpi == RPI_FIR) { prev->partnerRead = stored_MR; } else { stored_MR->partnerRead = prev; } // delete the entry from the hash cfuhash_delete(pair_buffer, stripped); } else { // we should put it in the hash // make sure to change it into something // we will find next time if(rpi == RPI_FIR) stripped[strlen(stripped)-1] = '1'; else stripped[strlen(stripped)-1] = '2'; // check to make sure we're not overwriting // anything important. cfuhash overwrites // duplicate entries, so we need to grab // it and put it to "SNGL_XXX" before we // lose the pointer BM_mappedRead * OWMMR = \ cfuhash_put(pair_buffer, stripped, prev); if(OWMMR) { if(OWMMR->rpi == RPI_FIR) OWMMR->rpi = RPI_SNGL_FIR; else OWMMR->rpi = RPI_SNGL_SEC; OWMMR->partnerRead = OWMMR; printPairCorruptionWarning(p_corrupt); p_corrupt = 1; } } if(stripped_result != 0) { // free this! 
free(stripped_result); stripped_result = 0; } } } hts_itr_destroy(iter); if (result < -1) { fprintf(stderr, "ERROR: retrieval of reads from "\ "contig: \"%s\" failed due to "\ "truncated file or corrupt BAM index "\ "file\n", header->target_name[hh]); break; } } // any entries left in the hash are pairs whose mates did // not meet quality standards size_t key_size = 0; char * key; BM_mappedRead * LOMMR; size_t pr_size = 1; if(cfuhash_each_data(pair_buffer, (void**)&key, &key_size, (void**)&LOMMR, &pr_size)) { do { // get the mapped read // update it's pairing so we know it's really single if (LOMMR->rpi == RPI_FIR) LOMMR->rpi = RPI_SNGL_FIR; else if (LOMMR->rpi == RPI_SEC) LOMMR->rpi = RPI_SNGL_SEC; // indicate singleton reads by pointing the // partner pointer to itself LOMMR->partnerRead = LOMMR; } while(cfuhash_next_data(pair_buffer, (void**)&key, &key_size, (void**)&LOMMR, &pr_size)); } cfuhash_clear(pair_buffer); cfuhash_destroy(pair_buffer); } hts_idx_destroy(idx); // destroy the BAM index } } // always do this if (in) sam_close(in); bam_destroy1(b); if ( header ) bam_hdr_destroy(header); return root; }
/*
 * Return the query name of record b.  The pointer refers to storage inside
 * the record itself; nothing is copied or allocated.
 */
char* get_pair_name(const bam1_t *b) {
    char *qname = bam_get_qname(b);
    return qname;
}
/*
 * Recompute the MD string and NM count of read b against the reference
 * sequence ref and optionally modify the record.
 *
 * b        - alignment to process (modified in place)
 * ref      - reference bases for b's chromosome, indexed by 0-based position
 * ref_len  - number of usable bases in ref; the scan also stops at a NUL
 * flag     - bit set of USE_EQUAL (replace matching bases with '='),
 *            UPDATE_NM, UPDATE_MD, DROP_TAG (drop all tags but RG) and
 *            BIN_QUAL (reduce base-quality resolution)
 * max_nm   - when > 0 and the computed NM is >= max_nm, matching bases are
 *            set to code 15 and their qualities to 0
 *
 * NM and MD are only written for mapped reads.  When an existing tag differs
 * from the recomputed value a message is printed to stderr and the tag is
 * replaced.
 *
 * Changes from the previous version: reference characters are converted to
 * unsigned char before indexing seq_nt16_table (and before toupper()), so
 * bytes with the high bit set can no longer produce a negative index; the MD
 * buffer lives on the stack instead of in an unchecked calloc(); and the MD
 * update is skipped if the buffer could not be allocated.
 */
void bam_fillmd1_core(bam1_t *b, char *ref, int ref_len, int flag, int max_nm)
{
    uint8_t *seq = bam_get_seq(b);
    uint32_t *cigar = bam_get_cigar(b);
    bam1_core_t *c = &b->core;
    int i, x, y, u = 0;
    kstring_t str = { 0, 0, NULL };
    int32_t old_nm_i = -1, nm = 0;

    /* x walks the reference, y walks the read, u counts the current run of
     * matches that has not yet been written to the MD string. */
    for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
        int j, l = cigar[i]>>4, op = cigar[i]&0xf;
        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
            for (j = 0; j < l; ++j) {
                int c1, c2, z = y + j;
                if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
                c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(unsigned char)ref[x+j]];
                if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
                    if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 0xf0 : 0x0f;
                    ++u;
                } else {
                    kputw(u, &str); kputc(ref[x+j], &str);
                    u = 0; ++nm;
                }
            }
            if (j < l) break;
            x += l; y += l;
        } else if (op == BAM_CDEL) {
            kputw(u, &str); kputc('^', &str);
            for (j = 0; j < l; ++j) {
                if (x+j >= ref_len || ref[x+j] == '\0') break;
                kputc(ref[x+j], &str);
            }
            u = 0;
            x += j; nm += j;
            if (j < l) break;
        } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
            y += l;
            if (op == BAM_CINS) nm += l;
        } else if (op == BAM_CREF_SKIP) {
            x += l;
        }
    }
    kputw(u, &str);

    // apply max_nm
    if (max_nm > 0 && nm >= max_nm) {
        for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
            int j, l = cigar[i]>>4, op = cigar[i]&0xf;
            if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
                for (j = 0; j < l; ++j) {
                    int c1, c2, z = y + j;
                    if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
                    c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(unsigned char)ref[x+j]];
                    if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
                        seq[z/2] |= (z&1)? 0x0f : 0xf0;
                        bam_get_qual(b)[z] = 0;
                    }
                }
                if (j < l) break;
                x += l; y += l;
            } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
            else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
        }
    }

    // update NM
    if ((flag & UPDATE_NM) && !(c->flag & BAM_FUNMAP)) {
        uint8_t *old_nm = bam_aux_get(b, "NM");
        if (old_nm) old_nm_i = bam_aux2i(old_nm);
        if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
        else if (nm != old_nm_i) {
            fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam_get_qname(b), old_nm_i, nm);
            bam_aux_del(b, old_nm);
            bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
        }
    }

    // update MD (skipped if the MD buffer could not be allocated)
    if ((flag & UPDATE_MD) && !(c->flag & BAM_FUNMAP) && str.s != NULL) {
        uint8_t *old_md = bam_aux_get(b, "MD");
        if (!old_md) bam_aux_append(b, "MD", 'Z', str.l + 1, (uint8_t*)str.s);
        else {
            int is_diff = 0;
            // the comparison ignores letter case
            if (strlen((char*)old_md+1) == str.l) {
                for (i = 0; i < str.l; ++i)
                    if (toupper(old_md[i+1]) != toupper((unsigned char)str.s[i]))
                        break;
                if (i < str.l) is_diff = 1;
            } else is_diff = 1;
            if (is_diff) {
                fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), old_md+1, str.s);
                bam_aux_del(b, old_md);
                bam_aux_append(b, "MD", 'Z', str.l + 1, (uint8_t*)str.s);
            }
        }
    }

    // drop all tags but RG
    if (flag&DROP_TAG) {
        uint8_t *q = bam_aux_get(b, "RG");
        bam_aux_drop_other(b, q);
    }

    // reduce the resolution of base quality
    if (flag&BIN_QUAL) {
        uint8_t *qual = bam_get_qual(b);
        for (i = 0; i < b->core.l_qseq; ++i)
            if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7;
    }

    free(str.s);
}
int main_bam2fq(int argc, char *argv[]) { BGZF *fp, *fpse = 0; bam1_t *b; uint8_t *buf; int max_buf, c, has12 = 0; kstring_t str; int64_t n_singletons = 0, n_reads = 0; char last[512], *fnse = 0; while ((c = getopt(argc, argv, "as:")) > 0) if (c == 'a') has12 = 1; else if (c == 's') fnse = optarg; if (argc == optind) { fprintf(stderr, "\nUsage: bam2fq [-a] [-s outSE] <in.bam>\n\n"); fprintf(stderr, "Options: -a append /1 and /2 to the read name\n"); fprintf(stderr, " -s FILE write singleton reads to FILE [assume single-end]\n"); fprintf(stderr, "\n"); return 1; } fp = strcmp(argv[optind], "-")? bgzf_open(argv[optind], "r") : bgzf_dopen(fileno(stdin), "r"); assert(fp); bam_hdr_destroy(bam_hdr_read(fp)); buf = 0; max_buf = 0; str.l = str.m = 0; str.s = 0; last[0] = 0; if (fnse) fpse = bgzf_open(fnse, "w1"); b = bam_init1(); while (bam_read1(fp, b) >= 0) { int i, qlen = b->core.l_qseq, is_print = 0; uint8_t *qual, *seq; if (b->flag&BAM_FSECONDARY) continue; // skip secondary alignments ++n_reads; if (fpse) { if (str.l && strcmp(last, bam_get_qname(b))) { bgzf_write(fpse, str.s, str.l); str.l = 0; ++n_singletons; } if (str.l) is_print = 1; strcpy(last, bam_get_qname(b)); } else is_print = 1; qual = bam_get_qual(b); kputc(qual[0] == 0xff? 
'>' : '@', &str); kputsn(bam_get_qname(b), b->core.l_qname - 1, &str); if (has12) { kputc('/', &str); kputw(b->core.flag>>6&3, &str); } kputc('\n', &str); if (max_buf < qlen + 1) { max_buf = qlen + 1; kroundup32(max_buf); buf = (uint8_t*)realloc(buf, max_buf); } buf[qlen] = 0; seq = bam_get_seq(b); for (i = 0; i < qlen; ++i) buf[i] = bam_seqi(seq, i); // copy the sequence if (bam_is_rev(b)) { // reverse complement for (i = 0; i < qlen>>1; ++i) { int8_t t = seq_comp_table[buf[qlen - 1 - i]]; buf[qlen - 1 - i] = seq_comp_table[buf[i]]; buf[i] = t; } if (qlen&1) buf[i] = seq_comp_table[buf[i]]; } for (i = 0; i < qlen; ++i) buf[i] = seq_nt16_str[buf[i]]; kputsn((char*)buf, qlen, &str); kputc('\n', &str); if (qual[0] != 0xff) { kputsn("+\n", 2, &str); for (i = 0; i < qlen; ++i) buf[i] = 33 + qual[i]; if (bam_is_rev(b)) { // reverse for (i = 0; i < qlen>>1; ++i) { uint8_t t = buf[qlen - 1 - i]; buf[qlen - 1 - i] = buf[i]; buf[i] = t; } } } kputsn((char*)buf, qlen, &str); kputc('\n', &str); if (is_print) { fwrite(str.s, 1, str.l, stdout); str.l = 0; } } if (fpse) { if (str.l) { bgzf_write(fpse, str.s, str.l); ++n_singletons; } fprintf(stderr, "[M::%s] discarded %lld singletons\n", __func__, (long long)n_singletons); bgzf_close(fpse); } fprintf(stderr, "[M::%s] processed %lld reads\n", __func__, (long long)n_reads); free(buf); free(str.s); bam_destroy1(b); bgzf_close(fp); return 0; }
// Update the training data with aligned events from a read
//
// For each strand of the read that has events, the read is aligned to the
// reference with align_read_to_ref(), optionally recalibrated (opt::calibrate)
// and scored (opt::output_scores, printed to std::cout), and then every
// aligned event is folded into `training`: events that pass the filters below
// are appended to the summary of the k-mer they aligned to, and the per-k-mer
// match ('M') / stay ('E') counters are incremented.
//
// name_map      - used to look up the fast5 path for the read name
// fai, hdr      - passed through to the aligner / scoring functions
// record        - the BAM record of the read
// read_idx      - passed to the aligner and printed with the scores
// region_start,
// region_end    - passed to the aligner
// round         - printed with the scores
// training      - map from model short name to per-k-mer summaries (updated)
//
// The updates of the shared summaries are guarded by OpenMP critical/atomic
// directives.
void add_aligned_events(const Fast5Map& name_map,
                        const faidx_t* fai,
                        const bam_hdr_t* hdr,
                        const bam1_t* record,
                        size_t read_idx,
                        int region_start,
                        int region_end,
                        size_t round,
                        ModelTrainingMap& training)
{
    // Load a squiggle read for the mapped read
    std::string read_name = bam_get_qname(record);
    std::string fast5_path = name_map.get_path(read_name);

    // load read
    SquiggleRead sr(read_name, fast5_path);

    // replace the models that are built into the read with the current trained model
    sr.replace_models(opt::trained_model_type);

    for(size_t strand_idx = 0; strand_idx < NUM_STRANDS; ++strand_idx) {

        // skip if 1D reads and this is the wrong strand
        if(!sr.has_events_for_strand(strand_idx)) {
            continue;
        }

        // set k
        uint32_t k = sr.pore_model[strand_idx].k;

        // Align to the new model
        EventAlignmentParameters params;
        params.sr = &sr;
        params.fai = fai;
        params.hdr = hdr;
        params.record = record;
        params.strand_idx = strand_idx;

        params.alphabet = mtrain_alphabet;
        params.read_idx = read_idx;
        params.region_start = region_start;
        params.region_end = region_end;

        std::vector<EventAlignment> alignment_output = align_read_to_ref(params);
        // NOTE(review): this leaves the whole function, so any remaining
        // strand is not processed either -- confirm that is intended rather
        // than `continue`.
        if (alignment_output.size() == 0)
            return;

        // Update pore model based on alignment
        std::string curr_model = sr.pore_model[strand_idx].metadata.get_short_name();
        double orig_score = -INFINITY;

        if (opt::output_scores) {
            orig_score = model_score(sr, strand_idx, fai, alignment_output, 500, NULL);

            #pragma omp critical(print)
            std::cout << round << " " << curr_model << " " << read_idx << " " << strand_idx << " Original " << orig_score << std::endl;
        }

        if ( opt::calibrate ) {
            double resid = 0.;
            recalibrate_model(sr, strand_idx, alignment_output, mtrain_alphabet, resid, true);

            if (opt::output_scores) {
                double rescaled_score = model_score(sr, strand_idx, fai, alignment_output, 500, NULL);
                #pragma omp critical(print)
                {
                    std::cout << round << " " << curr_model << " " << read_idx << " " << strand_idx << " Rescaled " << rescaled_score << std::endl;
                    std::cout << round << " " << curr_model << " " << read_idx << " " << strand_idx << " Delta " << rescaled_score-orig_score << std::endl;
                }
            }
        }

        // Get the training data for this model
        auto& emission_map = training[curr_model];

        for(size_t i = 0; i < alignment_output.size(); ++i) {
            const EventAlignment& ea = alignment_output[i];
            std::string model_kmer = ea.model_kmer;

            // Grab the previous/next model kmer from the alignment_output table.
            // If the read is from the same strand as the reference
            // the next kmer comes from the next alignment_output (and vice-versa)
            // otherwise the indices are swapped
            int next_stride = ea.rc ? -1 : 1;

            std::string prev_kmer = "";
            std::string next_kmer = "";

            // the first and last entries have no neighbour on one side
            if(i > 0 && i < alignment_output.size() - 1) {

                // check that the event indices are correct for the next expected position
                assert(alignment_output[i + next_stride].event_idx - ea.event_idx == 1);
                assert(alignment_output[i - next_stride].event_idx - ea.event_idx == -1);

                // only set the previous/next when there was exactly one base of movement along the reference
                if( std::abs(alignment_output[i + next_stride].ref_position - ea.ref_position) == 1) {
                    next_kmer = alignment_output[i + next_stride].model_kmer;
                }

                if( std::abs(alignment_output[i - next_stride].ref_position - ea.ref_position) == 1) {
                    prev_kmer = alignment_output[i - next_stride].model_kmer;
                }
            }

            // Get the rank of the kmer that we aligned to (on the sequencing strand, = model_kmer)
            uint32_t rank = mtrain_alphabet->kmer_rank(model_kmer.c_str(), k);
            assert(rank < emission_map.size());
            auto& kmer_summary = emission_map[rank];

            // We only use this event for training if it's not at the end of the alignment
            // (to avoid bad alignments around the read edges) and if it's not too short (to
            // avoid bad measurements from affecting the levels too much).
            // The event must also be in the 'M' state and have a fully scaled
            // level of at least 1.0.
            bool use_for_training = i > opt::min_distance_from_alignment_end &&
                i + opt::min_distance_from_alignment_end < alignment_output.size() &&
                alignment_output[i].hmm_state == 'M' &&
                sr.get_duration( alignment_output[i].event_idx, strand_idx) >= opt::min_event_duration &&
                sr.get_fully_scaled_level(alignment_output[i].event_idx, strand_idx) >= 1.0;

            if(use_for_training) {
                StateTrainingData std(sr, ea, rank, prev_kmer, next_kmer);
                #pragma omp critical(kmer)
                kmer_summary.events.push_back(std);
            }

            if(ea.hmm_state == 'M')  {
                #pragma omp atomic
                kmer_summary.num_matches += 1;
            } else if(ea.hmm_state == 'E') {
                #pragma omp atomic
                kmer_summary.num_stays += 1;
            }
        }
    } // for strands
}
/*
 * Rewrite the header-dependent fields of record b using translation table
 * tbl: the reference ids of the read and its mate (tbl->tid_trans) and the
 * values of the RG and PG tags (tbl->rg_trans, tbl->pg_trans).
 *
 * A tag whose value has no entry in the corresponding table is removed from
 * the record and a message is written to pysamerr.  A tag that is not stored
 * as a string (bam_aux2Z() returns NULL) cannot match any entry either; it
 * is reported and removed in the same way instead of being passed to the
 * hash lookup as a NULL key.
 */
static void bam_translate(bam1_t* b, trans_tbl_t* tbl)
{
    // Update target id if not unmapped tid
    if ( b->core.tid >= 0 ) { b->core.tid = tbl->tid_trans[b->core.tid]; }
    if ( b->core.mtid >= 0 ) { b->core.mtid = tbl->tid_trans[b->core.mtid]; }

    // If we have a RG update it
    uint8_t *rg = bam_aux_get(b, "RG");
    if (rg) {
        char* decoded_rg = bam_aux2Z(rg);
        if (decoded_rg == NULL) {
            fprintf(pysamerr, "[bam_translate] RG tag on read \"%s\" is not a string, tag lost\n", bam_get_qname(b));
            bam_aux_del(b, rg);
        } else {
            khiter_t k = kh_get(c2c, tbl->rg_trans, decoded_rg);
            if (k != kh_end(tbl->rg_trans)) {
                char* translate_rg = kh_value(tbl->rg_trans,k);
                bam_aux_del(b, rg);
                bam_aux_append(b, "RG", 'Z', strlen(translate_rg) + 1, (uint8_t*)translate_rg);
            } else {
                // report before deleting: decoded_rg points into the record
                fprintf(pysamerr, "[bam_translate] RG tag \"%s\" on read \"%s\" encountered with no corresponding entry in header, tag lost\n",decoded_rg, bam_get_qname(b));
                bam_aux_del(b, rg);
            }
        }
    }

    // If we have a PG update it (looked up only now: the RG edit above may
    // have moved the auxiliary data)
    uint8_t *pg = bam_aux_get(b, "PG");
    if (pg) {
        char* decoded_pg = bam_aux2Z(pg);
        if (decoded_pg == NULL) {
            fprintf(pysamerr, "[bam_translate] PG tag on read \"%s\" is not a string, tag lost\n", bam_get_qname(b));
            bam_aux_del(b, pg);
        } else {
            khiter_t k = kh_get(c2c, tbl->pg_trans, decoded_pg);
            if (k != kh_end(tbl->pg_trans)) {
                char* translate_pg = kh_value(tbl->pg_trans,k);
                bam_aux_del(b, pg);
                bam_aux_append(b, "PG", 'Z', strlen(translate_pg) + 1, (uint8_t*)translate_pg);
            } else {
                // report before deleting: decoded_pg points into the record
                fprintf(pysamerr, "[bam_translate] PG tag \"%s\" on read \"%s\" encountered with no corresponding entry in header, tag lost\n",decoded_pg, bam_get_qname(b));
                bam_aux_del(b, pg);
            }
        }
    }
}
int main(int argc, char **argv) { if (argc < 4) errx(1, "usage\t:%s <bam> <split out> <discord out> (optional #threads)", argv[0]); char *bam_file_name = argv[1]; char *split_file_name = argv[2]; char *disc_file_name = argv[3]; int threads = 2; if (argc == 5) { threads = atoi(argv[4]); } samFile *disc = sam_open(disc_file_name, "wb"); samFile *split = sam_open(split_file_name, "wb"); samFile *in = sam_open(bam_file_name, "rb"); if(in == NULL) errx(1, "Unable to open BAM/SAM file."); // TODO: handle cram. if (threads > 1) { bgzf_mt(in->fp.bgzf, threads, 256); } hts_idx_t *idx = sam_index_load(in, bam_file_name); if(idx == NULL) errx(1,"Unable to open BAM/SAM index."); bam_hdr_t *hdr = sam_hdr_read(in); int r = sam_hdr_write(disc, hdr); r = sam_hdr_write(split, hdr); bam1_t *aln = bam_init1(); int ret; while(ret = sam_read1(in, hdr, aln) >= 0) { if (((aln->core.flag) & 1294) == 0) r = sam_write1(disc, hdr, aln); uint8_t *sa = bam_aux_get(aln, "SA"); if (sa != 0) { char *sa_tag = strdup(bam_aux2Z(sa)); if ( count_tags(sa_tag) == 1) { char *chrm, strand, *cigar; uint32_t pos; split_sa_tag(sa_tag, &chrm, &pos, &strand, &cigar); struct line sa, al; calcOffsets(cigar, pos, strand, &sa); sa.chrm = chrm; sa.strand = strand; calcAlnOffsets(bam_get_cigar(aln), aln->core.n_cigar, aln->core.pos, bam_is_rev(aln) ? '-' : '+', &al); al.chrm = hdr->target_name[aln->core.tid]; al.strand = bam_is_rev(aln) ? 
'-' : '+'; struct line *left = &al, *right = &sa; if (left->SQO > right->SQO) { left = &sa; right = &al; } int overlap = MAX(1 + MIN(left->EQO, right->EQO) - MAX(left->SQO, right->SQO), 0); int alen1 = 1 + left->EQO - left->SQO; int alen2 = 1 + right->EQO - right->SQO; int mno = MIN(alen1-overlap, alen2-overlap); if (mno < MIN_NON_OVERLAP) continue; if ( (strcmp(left->chrm, right->chrm) == 0) && (left->strand == right->strand) ) { int leftDiag, rightDiag, insSize; if (left->strand == '-') { leftDiag = left->rapos - left->sclip; rightDiag = (right->rapos + right->raLen) - (right->sclip + right->qaLen); insSize = rightDiag - leftDiag; } else { leftDiag = (left->rapos + left->raLen) - (left->sclip + left->qaLen); rightDiag = right->rapos - right->sclip; insSize = leftDiag - rightDiag; } int desert = right->SQO - left->EQO - 1; if ((abs(insSize) < MIN_INDEL_SIZE) || ((desert > 0) && ( (desert - (int)MAX(0, insSize)) > MAX_UNMAPPED_BASES))) continue; } char *qname = bam_get_qname(aln); if ((aln->core.flag & 64) == 64) qname[0] = 'A'; else qname[0] = 'B'; r = sam_write1(split, hdr, aln); } free(sa_tag); } } bam_destroy1(aln); hts_idx_destroy(idx); bam_hdr_destroy(hdr); sam_close(in); sam_close(disc); sam_close(split); if(ret < -1) { errx(1, "lumpy_filter: error reading bam: %s\n", bam_file_name); } }
// Test CpG sites in this read for methylation void calculate_methylation_for_read(const ModelMap& model_map, const Fast5Map& name_map, const faidx_t* fai, const bam_hdr_t* hdr, const bam1_t* record, size_t read_idx, const OutputHandles& handles) { // Load a squiggle read for the mapped read std::string read_name = bam_get_qname(record); std::string fast5_path = name_map.get_path(read_name); SquiggleRead sr(read_name, fast5_path); // An output map from reference positions to scored CpG sites std::map<int, ScoredSite> site_score_map; for(size_t strand_idx = 0; strand_idx < NUM_STRANDS; ++strand_idx) { std::vector<double> site_scores; std::vector<int> site_starts; std::vector<int> site_ends; std::vector<int> site_count; // replace the baked-in pore model with the methylation model // (including unmethylated kmers) for this strand std::string curr_model = sr.pore_model[strand_idx].name; std::string methyl_model = curr_model + ".ecoli_er2925.pcr_MSssI.timp.021216.alphabet_cpg.model"; auto model_iter = model_map.find(methyl_model); if(model_iter != model_map.end()) { sr.pore_model[strand_idx].update_states( model_iter->second ); } else { fprintf(stderr, "Error, methylated model %s not found\n", methyl_model.c_str()); exit(EXIT_FAILURE); } size_t k = sr.pore_model[strand_idx].k; // Align in event space using the new model EventAlignmentParameters params; params.sr = &sr; params.fai = fai; params.hdr = hdr; params.record = record; params.strand_idx = strand_idx; params.read_idx = read_idx; params.alphabet = mtest_alphabet; std::vector<EventAlignment> alignment_output = align_read_to_ref(params); if(alignment_output.empty()) continue; std::string contig = alignment_output.front().ref_name.c_str(); // Convert the EventAlignment to a map between reference positions and events std::vector<AlignedPair> event_aligned_pairs; for(size_t i = 0; i < alignment_output.size(); ++i) { AlignedPair ap = { alignment_output[i].ref_position, alignment_output[i].event_idx }; 
event_aligned_pairs.push_back(ap); } int ref_start_pos = event_aligned_pairs.front().ref_pos; int ref_end_pos = event_aligned_pairs.back().ref_pos; // Extract the reference sequence for this region int fetched_len = 0; assert(ref_end_pos >= ref_start_pos); std::string ref_seq = get_reference_region_ts(params.fai, contig.c_str(), ref_start_pos, ref_end_pos, &fetched_len); // Remove non-ACGT bases from this reference segment ref_seq = gDNAAlphabet.disambiguate(ref_seq); // Scan the sequence for CpGs std::vector<int> cpg_sites; assert(ref_seq.size() != 0); for(size_t i = 0; i < ref_seq.size() - 1; ++i) { if(ref_seq[i] == 'C' && ref_seq[i+1] == 'G') { cpg_sites.push_back(i); } } // Batch the CpGs together into groups that are separated by some minimum distance int min_separation = 10; size_t curr_idx = 0; while(curr_idx < cpg_sites.size()) { // Find the endpoint of this group of sites size_t end_idx = curr_idx + 1; while(end_idx < cpg_sites.size()) { if(cpg_sites[end_idx] - cpg_sites[end_idx - 1] > min_separation) break; end_idx += 1; } // the coordinates on the reference substring for this group of sites int sub_start_pos = cpg_sites[curr_idx] - min_separation; int sub_end_pos = cpg_sites[end_idx - 1] + min_separation; if(sub_start_pos > min_separation && cpg_sites[end_idx - 1] - cpg_sites[curr_idx] < 200) { std::string subseq = ref_seq.substr(sub_start_pos, sub_end_pos - sub_start_pos + 1); std::string rc_subseq = mtest_alphabet->reverse_complement(subseq); // using the reference-to-event map, look up the event indices for this segment AlignedPairRefLBComp lb_comp; AlignedPairConstIter start_iter = std::lower_bound(event_aligned_pairs.begin(), event_aligned_pairs.end(), sub_start_pos + ref_start_pos, lb_comp); AlignedPairConstIter stop_iter = std::lower_bound(event_aligned_pairs.begin(), event_aligned_pairs.end(), sub_end_pos + ref_start_pos, lb_comp); // Only process this region if the the read is aligned within the boundaries // and the span between the start/end 
is not unusually short if(start_iter != event_aligned_pairs.end() && stop_iter != event_aligned_pairs.end() && abs(start_iter->read_pos - stop_iter->read_pos) > 10) { uint32_t hmm_flags = HAF_ALLOW_PRE_CLIP | HAF_ALLOW_POST_CLIP; // Set up event data HMMInputData data; data.read = &sr; data.anchor_index = -1; // unused data.strand = strand_idx; data.rc = alignment_output.front().rc; data.event_start_idx = start_iter->read_pos; data.event_stop_idx = stop_iter->read_pos; data.event_stride = data.event_start_idx <= data.event_stop_idx ? 1 : -1; // Calculate the likelihood of the unmethylated sequence HMMInputSequence unmethylated(subseq, rc_subseq, mtest_alphabet); double unmethylated_score = profile_hmm_score(unmethylated, data, hmm_flags); // Methylate all CpGs in the sequence and score again std::string mcpg_subseq = mtest_alphabet->methylate(subseq); std::string rc_mcpg_subseq = mtest_alphabet->reverse_complement(mcpg_subseq); // Calculate the likelihood of the methylated sequence HMMInputSequence methylated(mcpg_subseq, rc_mcpg_subseq, mtest_alphabet); double methylated_score = profile_hmm_score(methylated, data, hmm_flags); // Aggregate score int start_position = cpg_sites[curr_idx] + ref_start_pos; auto iter = site_score_map.find(start_position); if(iter == site_score_map.end()) { // insert new score into the map ScoredSite ss; ss.chromosome = contig; ss.start_position = start_position; ss.end_position = cpg_sites[end_idx - 1] + ref_start_pos; ss.n_cpg = end_idx - curr_idx; // extract the CpG site(s) with a k-mers worth of surrounding context size_t site_output_start = cpg_sites[curr_idx] - k + 1; size_t site_output_end = cpg_sites[end_idx - 1] + k; ss.sequence = ref_seq.substr(site_output_start, site_output_end - site_output_start); // insert into the map iter = site_score_map.insert(std::make_pair(start_position, ss)).first; } // set strand-specific score // upon output below the strand scores will be summed iter->second.ll_unmethylated[strand_idx] = 
unmethylated_score; iter->second.ll_methylated[strand_idx] = methylated_score; } } curr_idx = end_idx; } } // for strands #pragma omp critical(methyltest_write) { // these variables are sums over all sites within a read double ll_ratio_sum_strand[2] = { 0.0f, 0.0f }; double ll_ratio_sum_both = 0; size_t num_positive = 0; // write all sites for this read for(auto iter = site_score_map.begin(); iter != site_score_map.end(); ++iter) { const ScoredSite& ss = iter->second; double sum_ll_m = ss.ll_methylated[0] + ss.ll_methylated[1]; double sum_ll_u = ss.ll_unmethylated[0] + ss.ll_unmethylated[1]; double diff = sum_ll_m - sum_ll_u; num_positive += diff > 0; fprintf(handles.site_writer, "%s\t%d\t%d\t", ss.chromosome.c_str(), ss.start_position, ss.end_position); fprintf(handles.site_writer, "ReadIdx=%zu;", read_idx); fprintf(handles.site_writer, "LogLikMeth=%.2lf;LogLikUnmeth=%.2lf;LogLikRatio=%.2lf;", sum_ll_m, sum_ll_u, diff); fprintf(handles.site_writer, "LogLikMethByStrand=%.2lf,%.2lf;", ss.ll_methylated[0], ss.ll_methylated[1]); fprintf(handles.site_writer, "LogLikUnmethByStrand=%.2lf,%.2lf;", ss.ll_unmethylated[0], ss.ll_unmethylated[1]); fprintf(handles.site_writer, "NumCpGs=%d;Sequence=%s\n", ss.n_cpg, ss.sequence.c_str()); ll_ratio_sum_strand[0] += ss.ll_methylated[0] - ss.ll_unmethylated[0]; ll_ratio_sum_strand[1] += ss.ll_methylated[1] - ss.ll_unmethylated[1]; ll_ratio_sum_both += diff; } std::string complement_model = sr.pore_model[C_IDX].name; fprintf(handles.read_writer, "%s\t%.2lf\t%zu\t%s\tNumPositive=%zu\n", fast5_path.c_str(), ll_ratio_sum_both, site_score_map.size(), complement_model.c_str(), num_positive); for(size_t si = 0; si < NUM_STRANDS; ++si) { std::string model = sr.pore_model[si].name; fprintf(handles.strand_writer, "%s\t%.2lf\t%zu\t%s\n", fast5_path.c_str(), ll_ratio_sum_strand[si], site_score_map.size(), model.c_str()); } } }
/*
 * Read up to n_needed reads from the BAM stream held in bs.
 *
 * bs        - sequence reader; bs->which selects which records are used
 *             (bit 1: flagged BAM_FREAD1, bit 2: flagged BAM_FREAD2,
 *             bit 4: flagged as neither)
 * n_needed  - capacity of the returned array
 * n         - out: number of reads stored
 * is_comp   - passed to seq_reverse() when building the rseq copy
 * trim_qual - when >= 1, each read is passed to bwa_trim_read()
 *
 * Returns a calloc'ed array that the caller releases with
 * bwa_free_read_seq(), or 0 when no read was stored.  A read error other
 * than end of file is fatal.
 */
static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int trim_qual)
{
	bwa_seq_t *seqs, *p;
	int n_seqs, l, i;
	long n_trimmed = 0, n_tot = 0;
	bam1_t *b;
	int res;

	b = bam_init1();
	n_seqs = 0;
	seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
	for (;;) {
		uint8_t *s, *q;
		int go = 0;
#ifdef USE_HTSLIB
		res = sam_read1(bs->fp, bs->h, b);
#else
		res = bam_read1(bs->fp, b);
#endif
		if (res < 0) break;
		if ((bs->which & 1) && (b->core.flag & BAM_FREAD1)) go = 1;
		if ((bs->which & 2) && (b->core.flag & BAM_FREAD2)) go = 1;
		if ((bs->which & 4) && !(b->core.flag& BAM_FREAD1) && !(b->core.flag& BAM_FREAD2)) go = 1;
		if (go == 0) continue;
		l = b->core.l_qseq;
		p = &seqs[n_seqs++];
		p->tid = -1; // no assigned to a thread
		p->qual = 0;
		p->full_len = p->clip_len = p->len = l;
		n_tot += p->full_len;
#ifdef USE_HTSLIB
		s = bam_get_seq(b);
		q = bam_get_qual(b);
#else
		s = bam1_seq(b);
		q = bam1_qual(b);
#endif
		p->seq = (ubyte_t*)calloc(p->len + 1, 1);
		p->qual = (ubyte_t*)calloc(p->len + 1, 1);
		for (i = 0; i != p->full_len; ++i) {
#ifdef USE_HTSLIB
			p->seq[i] = bam_nt16_nt4_table[(int)bam_seqi(s, i)];
#else
			p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(s, i)];
#endif
			// convert to a printable quality character, capped at 126
			p->qual[i] = q[i] + 33 < 126? q[i] + 33 : 126;
		}
#ifdef USE_HTSLIB
		if (bam_is_rev(b)) { // then reverse
#else
		if (bam1_strand(b)) { // then reverse
#endif
			seq_reverse(p->len, p->seq, 1);
			seq_reverse(p->len, p->qual, 0);
		}
		if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
		p->rseq = (ubyte_t*)calloc(p->full_len, 1);
		memcpy(p->rseq, p->seq, p->len);
		seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
		seq_reverse(p->len, p->rseq, is_comp);
#ifdef USE_HTSLIB
		p->name = strdup((const char*)bam_get_qname(b));
#else
		p->name = strdup((const char*)bam1_qname(b));
#endif
		if (n_seqs == n_needed) break;
	}
	// -1 means end of file; anything lower is a read error
	if (res < -1) err_fatal_simple("Error reading bam file");
	*n = n_seqs;
	if (n_seqs && trim_qual >= 1) {
		// guard the ratio: every stored read may have had length zero
		fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", n_tot > 0? 100.0f * n_trimmed/n_tot : 0.0f);
	}
	bam_destroy1(b);
	if (n_seqs == 0) {
		free(seqs);
		return 0;
	}
	return seqs;
}

#define BARCODE_LOW_QUAL 13

/*
 * Read up to n_needed reads from bs.
 *
 * bs        - sequence reader; BAM input is delegated to bwa_read_bam()
 * n_needed  - capacity of the returned array
 * n         - out: number of reads stored (not written when the barcode
 *             length is rejected)
 * mode      - BWA_MODE_* bits; the bits above 24 hold the barcode length
 * trim_qual - when >= 1, reads that have qualities are passed to
 *             bwa_trim_read()
 *
 * For FASTA/FASTQ input: reads whose comment has 'Y' right after the first
 * ':' are skipped when BWA_MODE_CFY is set; qualities are lowered by 31 when
 * BWA_MODE_IL13 is set; a leading barcode of the requested length is moved
 * into p->bc (lower case where the base quality is below BARCODE_LOW_QUAL);
 * a trailing "/1" or "/2" is stripped from the read name.
 *
 * Returns a calloc'ed array that the caller releases with
 * bwa_free_read_seq(), or 0 when no read was stored or the barcode length
 * exceeds BWA_MAX_BCLEN.
 */
bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int trim_qual)
{
	bwa_seq_t *seqs, *p;
	kseq_t *seq = bs->ks;
	int n_seqs, l, i, is_comp = mode&BWA_MODE_COMPREAD, is_64 = mode&BWA_MODE_IL13, l_bc = mode>>24;
	long n_trimmed = 0, n_tot = 0;

	if (l_bc > BWA_MAX_BCLEN) {
		fprintf(stderr, "[%s] the maximum barcode length is %d.\n", __func__, BWA_MAX_BCLEN);
		return 0;
	}
	if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); // l_bc has no effect for BAM input
	n_seqs = 0;
	seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
	while ((l = kseq_read(seq)) >= 0) {
		if ((mode & BWA_MODE_CFY) && (seq->comment.l != 0)) {
			// skip reads that are marked to be filtered by Casava
			char *s = strchr(seq->comment.s, ':');
			if (s && *(++s) == 'Y') {
				continue;
			}
		}
		if (is_64 && seq->qual.l)
			for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31;
		if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length
		p = &seqs[n_seqs++];
		if (l_bc) { // then trim barcode
			for (i = 0; i < l_bc; ++i) {
				// <ctype.h> functions require a value representable as unsigned char
				unsigned char base = (unsigned char)seq->seq.s[i];
				p->bc[i] = (seq->qual.l && seq->qual.s[i]-33 < BARCODE_LOW_QUAL)? tolower(base) : toupper(base);
			}
			p->bc[i] = 0;
			for (; i < seq->seq.l; ++i)
				seq->seq.s[i - l_bc] = seq->seq.s[i];
			seq->seq.l -= l_bc; seq->seq.s[seq->seq.l] = 0;
			if (seq->qual.l) {
				for (i = l_bc; i < seq->qual.l; ++i)
					seq->qual.s[i - l_bc] = seq->qual.s[i];
				seq->qual.l -= l_bc; seq->qual.s[seq->qual.l] = 0;
			}
			l = seq->seq.l;
		} else p->bc[0] = 0;
		p->tid = -1; // no assigned to a thread
		p->qual = 0;
		p->full_len = p->clip_len = p->len = l;
		n_tot += p->full_len;
		p->seq = (ubyte_t*)calloc(p->full_len, 1);
		for (i = 0; i != p->full_len; ++i) {
			// the cast avoids a negative table index where plain char is signed
			p->seq[i] = nst_nt4_table[(unsigned char)seq->seq.s[i]];
		}
		if (seq->qual.l) { // copy quality
			p->qual = (ubyte_t*)strdup((char*)seq->qual.s);
			if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
		}
		p->rseq = (ubyte_t*)calloc(p->full_len, 1);
		memcpy(p->rseq, p->seq, p->len);
		seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
		seq_reverse(p->len, p->rseq, is_comp);
		p->name = strdup((const char*)seq->name.s);
		{ // trim /[12]$
			int t = strlen(p->name);
			if (t > 2 && p->name[t-2] == '/' && (p->name[t-1] == '1' || p->name[t-1] == '2')) p->name[t-2] = '\0';
		}
		if (n_seqs == n_needed) break;
	}
	*n = n_seqs;
	if (n_seqs && trim_qual >= 1) {
		// guard the ratio: every stored read may have had length zero
		fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", n_tot > 0? 100.0f * n_trimmed/n_tot : 0.0f);
	}
	if (n_seqs == 0) {
		free(seqs);
		return 0;
	}
	return seqs;
}

/*
 * Release an array of n_seqs reads together with every buffer the reads own
 * (per-hit CIGARs, name, seq, rseq, qual, aln, md, multi, cigar).
 */
void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs)
{
	int i, j;
	for (i = 0; i != n_seqs; ++i) {
		bwa_seq_t *p = seqs + i;
		// free(NULL) is a no-op, so no per-pointer guard is needed
		for (j = 0; j < p->n_multi; ++j)
			free(p->multi[j].cigar);
		free(p->name);
		free(p->seq); free(p->rseq); free(p->qual); free(p->aln); free(p->md); free(p->multi);
		free(p->cigar);
	}
	free(seqs);
}
static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state) { bam1_t* b = bam_init1(); char *current_qname = NULL; int64_t n_reads = 0, n_singletons = 0; // Statistics kstring_t linebuf[3] = {{0,0,NULL},{0,0,NULL},{0,0,NULL}}; int score[3]; int at_eof; if (b == NULL ) { perror("[bam2fq_mainloop_singletontrack] Malloc error for bam record buffer."); return false; } bool valid = true; while (true) { at_eof = sam_read1(state->fp, state->h, b) < 0; if (!at_eof && filter_it_out(b, state)) continue; if (!at_eof) ++n_reads; if (at_eof || !current_qname || (strcmp(current_qname, bam_get_qname(b)) != 0)) { if (current_qname) { if (score[1] > 0 && score[2] > 0) { // print linebuf[1] to fpr[1], linebuf[2] to fpr[2] if (fputs(linebuf[1].s, state->fpr[1]) == EOF) { valid = false; break; } if (fputs(linebuf[2].s, state->fpr[2]) == EOF) { valid = false; break; } } else if (score[1] > 0 || score[2] > 0) { // print whichever one exists to fpse if (score[1] > 0) { if (fputs(linebuf[1].s, state->fpse) == EOF) { valid = false; break; } } else { if (fputs(linebuf[2].s, state->fpse) == EOF) { valid = false; break; } } ++n_singletons; } if (score[0]) { // TODO: check this // print linebuf[0] to fpr[0] if (fputs(linebuf[0].s, state->fpr[0]) == EOF) { valid = false; break; } } } if (at_eof) break; free(current_qname); current_qname = strdup(bam_get_qname(b)); score[0] = score[1] = score[2] = 0; } // Prefer a copy of the read that has base qualities int b_score = bam_get_qual(b)[0] != 0xff? 
2 : 1; if (b_score > score[which_readpart(b)]) { if(!bam1_to_fq(b, &linebuf[which_readpart(b)], state)) { fprintf(stderr, "[%s] Error converting read to FASTA/Q\n", __func__); return false; } score[which_readpart(b)] = b_score; } } if (!valid) { perror("[bam2fq_mainloop_singletontrack] Error writing to FASTx files."); } bam_destroy1(b); free(current_qname); free(linebuf[0].s); free(linebuf[1].s); free(linebuf[2].s); fprintf(stderr, "[M::%s] discarded %" PRId64 " singletons\n", __func__, n_singletons); fprintf(stderr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads); return valid; }
/*
 * Count alignment start positions per reference sequence.
 *
 * refName   - not used by this function
 * bamName   - alignment file to read
 * reference - hash table that must already hold a chrcoverage entry for every
 *             reference name that occurs in the file
 *
 * A record is ignored when its read length is zero, when any of the flag bits
 * 0x4, 0x100, 0x200, 0x400 or 0x800 is set, or when its tid or pos is
 * negative.  Of paired records (0x1) only those with flag 0x40 are counted,
 * so that each fragment registers one vote.  Each counted record increments
 * cov[pos] of its reference; the counter saturates at 250.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE when the file cannot be opened, its
 * header cannot be read, a record cannot be allocated or read, or closing the
 * file fails.
 */
static int CreateCoverageMap(const char* const refName,
                             const char* const bamName,
                             hashtable* const reference)
{
    int status = EXIT_SUCCESS;
    samFile *in = NULL;
    bam_hdr_t *hdr = NULL;
    bam1_t *b = NULL;
    int ret;
    u64 numread = 0; // number of reads analyzed

    (void)refName;

    in = sam_open(bamName, "r");
    if (in == NULL) {
        return EXIT_FAILURE;
    }

    hdr = sam_hdr_read(in);
    if (hdr == NULL) {
        status = EXIT_FAILURE;
        goto clean;
    }

    b = bam_init1();
    if (b == NULL) {
        status = EXIT_FAILURE;
        goto clean;
    }

    while ((ret = sam_read1(in, hdr, b)) >= 0) {
        numread += 1;
        if ((numread % 10000000) == 0) {
            fprintf(stderr, "Processed %"PRIu64" reads\n", numread);
        }

        if (1 == debug_flag) {
            fprintf(stderr, "Read name : %s\n", bam_get_qname(b));
        }

        // ignore if this is a zero length read (have seen it in some cases
        // where the reads were clipped by another tool)
        if (b->core.l_qseq == 0) continue;

        // ignore records with any of these flag bits set
        if (b->core.flag & (0x4 | 0x100 | 0x200 | 0x400 | 0x800)) continue;

        // if this is paired, register one vote for the fragment: only the
        // record flagged 0x40 counts
        if ((b->core.flag & 0x1) && !(b->core.flag & 0x40)) continue;

        // both values are used as array indices below
        if (b->core.tid < 0 || b->core.pos < 0) continue;

        char *chrom = hdr->target_name[b->core.tid];
        chrcoverage* cov = must_find_hashtable(reference, chrom, strlen(chrom));
        cov->cov[b->core.pos] += 1;
        if (cov->cov[b->core.pos] == 251) cov->cov[b->core.pos] = 250;
    }

    // sam_read1() returns -1 at end of file and less than -1 on error
    if (ret < -1) {
        fprintf(stderr, "Error reading alignments from %s\n", bamName);
        status = EXIT_FAILURE;
    }

clean:
    if (b != NULL) bam_destroy1(b);
    if (hdr != NULL) bam_hdr_destroy(hdr);
    if (hts_close(in) != 0) status = EXIT_FAILURE;

    return status;
}
// currently, this function ONLY works if each read has one hit static void bam_mating_core(samFile* in, samFile* out, int remove_reads, int proper_pair_check, int add_ct) { bam_hdr_t *header; bam1_t *b[2]; int curr, has_prev, pre_end = 0, cur_end = 0; kstring_t str; str.l = str.m = 0; str.s = 0; header = sam_hdr_read(in); if (header == NULL) { fprintf(stderr, "[bam_mating_core] ERROR: Couldn't read header\n"); exit(1); } // Accept unknown, unsorted, or queryname sort order, but error on coordinate sorted. if ((header->l_text > 3) && (strncmp(header->text, "@HD", 3) == 0)) { char *p, *q; p = strstr(header->text, "\tSO:coordinate"); q = strchr(header->text, '\n'); // Looking for SO:coordinate within the @HD line only // (e.g. must ignore in a @CO comment line later in header) if ((p != 0) && (p < q)) { fprintf(stderr, "[bam_mating_core] ERROR: Coordinate sorted, require grouped/sorted by queryname.\n"); exit(1); } } sam_hdr_write(out, header); b[0] = bam_init1(); b[1] = bam_init1(); curr = 0; has_prev = 0; while (sam_read1(in, header, b[curr]) >= 0) { bam1_t *cur = b[curr], *pre = b[1-curr]; if (cur->core.flag & BAM_FSECONDARY) { if ( !remove_reads ) sam_write1(out, header, cur); continue; // skip secondary alignments } if (cur->core.flag & BAM_FSUPPLEMENTARY) { sam_write1(out, header, cur); continue; // pass supplementary alignments through unchanged (TODO:make them match read they came from) } if (cur->core.tid < 0 || cur->core.pos < 0) // If unmapped set the flag { cur->core.flag |= BAM_FUNMAP; } if ((cur->core.flag&BAM_FUNMAP) == 0) // If mapped calculate end { cur_end = bam_endpos(cur); // Check cur_end isn't past the end of the contig we're on, if it is set the UNMAP'd flag if (cur_end > (int)header->target_len[cur->core.tid]) cur->core.flag |= BAM_FUNMAP; } if (has_prev) { // do we have a pair of reads to examine? 
if (strcmp(bam_get_qname(cur), bam_get_qname(pre)) == 0) { // identical pair name pre->core.flag |= BAM_FPAIRED; cur->core.flag |= BAM_FPAIRED; sync_mate(pre, cur); if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)) && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // if safe set TLEN/ISIZE { uint32_t cur5, pre5; cur5 = (cur->core.flag&BAM_FREVERSE)? cur_end : cur->core.pos; pre5 = (pre->core.flag&BAM_FREVERSE)? pre_end : pre->core.pos; cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5; } else cur->core.isize = pre->core.isize = 0; if (add_ct) bam_template_cigar(pre, cur, &str); // TODO: Add code to properly check if read is in a proper pair based on ISIZE distribution if (proper_pair_check && !plausibly_properly_paired(pre,cur)) { pre->core.flag &= ~BAM_FPROPER_PAIR; cur->core.flag &= ~BAM_FPROPER_PAIR; } // Write out result if ( !remove_reads ) { sam_write1(out, header, pre); sam_write1(out, header, cur); } else { // If we have to remove reads make sure we do it in a way that doesn't create orphans with bad flags if(pre->core.flag&BAM_FUNMAP) cur->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR); if(cur->core.flag&BAM_FUNMAP) pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR); if(!(pre->core.flag&BAM_FUNMAP)) sam_write1(out, header, pre); if(!(cur->core.flag&BAM_FUNMAP)) sam_write1(out, header, cur); } has_prev = 0; } else { // unpaired? 
clear bad info and write it out if (pre->core.tid < 0 || pre->core.pos < 0 || pre->core.flag&BAM_FUNMAP) { // If unmapped pre->core.flag |= BAM_FUNMAP; pre->core.tid = -1; pre->core.pos = -1; } pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0; pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR); if ( !remove_reads || !(pre->core.flag&BAM_FUNMAP) ) sam_write1(out, header, pre); } } else has_prev = 1; curr = 1 - curr; pre_end = cur_end; } if (has_prev && !remove_reads) { // If we still have a BAM in the buffer it must be unpaired bam1_t *pre = b[1-curr]; if (pre->core.tid < 0 || pre->core.pos < 0 || pre->core.flag&BAM_FUNMAP) { // If unmapped pre->core.flag |= BAM_FUNMAP; pre->core.tid = -1; pre->core.pos = -1; } pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0; pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR); sam_write1(out, header, pre); } bam_hdr_destroy(header); bam_destroy1(b[0]); bam_destroy1(b[1]); free(str.s); }
/****************************************************************************** * * The main worker node function. * * int thread_id: the thread_id * char *fastq1: FIFO from which bowtie2 can get read1 * char *fastq2: FIFO from which bowtie2 can get read2 (if it exists) * *******************************************************************************/ void herd_worker_node(int thread_id, char *fastq1, char *fastq2) { int cmd_length = 1, max_qname = 0, status, strand; char *cmd, *last_qname = calloc(1, sizeof(char)); MPI_Header *packed_header; MPI_read *packed_read = calloc(1, sizeof(MPI_read)); bam_hdr_t *header; bam1_t *read1 = bam_init1(); bam1_t *read2 = bam_init1(); samFile *fp; #ifdef DEBUG MPI_Status stat; int current_p_size = 100; htsFile *of; bam_hdr_t *debug_header = bam_hdr_init(); bam1_t *debug_read = bam_init1(); global_header = bam_hdr_init(); void *p = calloc(100,1); char *oname = NULL; #else int i = 0; #endif time_t t0, t1; int swapped = 0; assert(last_qname); assert(packed_read); //Which strand should we be aligning to? if(config.directional) { strand = (thread_id-1) % 2; } else { strand = (thread_id-1) % 4; } packed_read->size = 0; packed_read->packed = NULL; //construct the bowtie2 command cmd_length += (int) strlen("bowtie2 -q --reorder") + 1; cmd_length += (int) strlen(config.bowtie2_options) + 1; cmd_length += (int) strlen("--norc -x") + 1; cmd_length += (int) strlen(config.genome_dir) + strlen("bisulfite_genome/CT_conversion/BS_CT") + 1; cmd_length += (int) 2*(strlen("-1 ") + strlen(fastq1)) + 3; if(config.paired) cmd_length += (int) strlen(fastq2); //This is likely unneeded. 
#ifdef DEBUG oname = malloc(sizeof(char) *(1+strlen(config.odir)+strlen(config.basename)+strlen("_X.bam"))); assert(oname); sprintf(oname, "%s%s_%i.bam", config.odir, config.basename, thread_id); if(!config.quiet) fprintf(stderr, "Writing output to %s\n", oname); of = sam_open(oname, "wb"); free(oname); #endif cmd = (char *) malloc(sizeof(char) * cmd_length); assert(cmd); if(strand == 0) { //OT Read#1 C->T, Read#2 G->A, Genome C->T only the + strand if(config.paired) { sprintf(cmd, "bowtie2 -q --reorder %s --norc -x %sbisulfite_genome/CT_conversion/BS_CT -1 %s -2 %s", config.bowtie2_options, config.genome_dir, fastq1, fastq2); } else { sprintf(cmd, "bowtie2 -q --reorder %s --norc -x %sbisulfite_genome/CT_conversion/BS_CT -U %s", config.bowtie2_options, config.genome_dir, fastq1); } } else if(strand == 1) { //OB Read#1 C->T, Read#2 G->A, Genome G->A only the - strand if(config.paired) { sprintf(cmd, "bowtie2 -q --reorder %s --nofw -x %sbisulfite_genome/GA_conversion/BS_GA -1 %s -2 %s", config.bowtie2_options, config.genome_dir, fastq1, fastq2); } else { sprintf(cmd, "bowtie2 -q --reorder %s --nofw -x %sbisulfite_genome/GA_conversion/BS_GA -U %s", config.bowtie2_options, config.genome_dir, fastq1); } } else if(strand == 2) { //CTOT Read#1 G->A, Read#2 C->T, Genome C->T, only the - strand if(config.paired) { sprintf(cmd, "bowtie2 -q --reorder %s --nofw -x %sbisulfite_genome/CT_conversion/BS_CT -1 %s -2 %s", config.bowtie2_options, config.genome_dir, fastq1, fastq2); } else { sprintf(cmd, "bowtie2 -q --reorder %s --nofw -x %sbisulfite_genome/CT_conversion/BS_CT -U %s", config.bowtie2_options, config.genome_dir, fastq1); } } else if(strand == 3) { //CTOB Read#1 G->A, Read#2 C->T, Genome G->A, only the + strand if(config.paired) { sprintf(cmd, "bowtie2 -q --reorder %s --norc -x %sbisulfite_genome/GA_conversion/BS_GA -1 %s -2 %s", config.bowtie2_options, config.genome_dir, fastq1, fastq2); } else { sprintf(cmd, "bowtie2 -q --reorder %s --norc -x 
%sbisulfite_genome/GA_conversion/BS_GA -U %s", config.bowtie2_options, config.genome_dir, fastq1); } } else { fprintf(stderr, "Oh shit, got strand %i!\n", strand); return; } //Start the process if(!config.quiet) fprintf(stderr, "Node %i executing: %s\n", thread_id, cmd); fflush(stderr); fp = sam_popen(cmd); header = sam_hdr_read(fp); #ifdef DEBUG sam_hdr_write(of, header); #endif #ifndef DEBUG packed_header = pack_header(header); if(thread_id == 1) { //Send the header MPI_Send((void *) &(packed_header->size), 1, MPI_INT, 0, 1, MPI_COMM_WORLD); status = MPI_Send((void *) packed_header->packed, packed_header->size, MPI_BYTE, 0, 2, MPI_COMM_WORLD); if(status != MPI_SUCCESS) { fprintf(stderr, "MPI_Send returned %i\n", status); fflush(stderr); } } #else packed_header = pack_header(header); void *tmp_pointer = malloc(packed_header->size); assert(tmp_pointer); MPI_Request request; MPI_Isend((void *) packed_header->packed, packed_header->size, MPI_BYTE, 0, 2, MPI_COMM_WORLD, &request); status = MPI_Recv(tmp_pointer, packed_header->size, MPI_BYTE, 0, 2, MPI_COMM_WORLD, &stat); if(status != MPI_SUCCESS) fprintf(stderr, "We seem to have not been able to send the message to ourselves!\n"); MPI_Wait(&request, &stat); unpack_header(debug_header, tmp_pointer); global_header = debug_header; free(tmp_pointer); #endif t0 = time(NULL); if(!config.quiet) fprintf(stderr, "Node %i began sending reads @%s", thread_id, ctime(&t0)); fflush(stderr); while(sam_read1(fp, header, read1) >= 0) { #ifdef DEBUG sam_write1(of, global_header, read1); #endif if(strcmp(bam_get_qname(read1), last_qname) == 0) { //Multimapper if(config.paired) { sam_read1(fp, header, read2); #ifdef DEBUG sam_write1(of, global_header, read2); #endif } continue; } else { if(read1->core.l_qname > max_qname) { max_qname = read1->core.l_qname + 10; last_qname = realloc(last_qname, sizeof(char) * max_qname); assert(last_qname); } strcpy(last_qname, bam_get_qname(read1)); } //Are paired-end reads in the wrong order? 
swapped = 0; if(config.paired) { if(read1->core.flag & BAM_FREAD2) { swapped = 1; sam_read1(fp, header, read2); packed_read = pack_read(read2, packed_read); #ifndef DEBUG MPI_Send((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD); #else sam_write1(of, global_header, read2); if(packed_read->size > current_p_size) { p = realloc(p, packed_read->size); assert(p); } MPI_Isend((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &request); status = MPI_Recv(p, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &stat); MPI_Wait(&request, &stat); debug_read = unpack_read(debug_read, p); #endif } } //Send the read packed_read = pack_read(read1, packed_read); #ifndef DEBUG MPI_Send((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD); #else if(packed_read->size > current_p_size) { p = realloc(p, packed_read->size); assert(p); } MPI_Isend(packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &request); status = MPI_Recv(p, packed_header->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &stat); MPI_Wait(&request, &stat); #endif //Deal with paired-end reads if(config.paired && !swapped) { sam_read1(fp, header, read2); packed_read = pack_read(read2, packed_read); #ifndef DEBUG MPI_Send((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD); #else sam_write1(of, global_header, read2); if(packed_read->size > current_p_size) { p = realloc(p, packed_read->size); assert(p); } MPI_Isend((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &request); status = MPI_Recv(p, packed_header->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &stat); MPI_Wait(&request, &stat); debug_read = unpack_read(debug_read, p); #endif } #ifndef DEBUG i++; #endif } t1 = time(NULL); if(!config.quiet) fprintf(stderr, "Node %i finished sending reads @%s\t(%f sec elapsed)\n", thread_id, ctime(&t1), difftime(t1, t0)); fflush(stderr); //Notify the master node packed_read->size 
= 0; #ifndef DEBUG void *A = malloc(1); assert(A); MPI_Send(A, 1, MPI_BYTE, 0, 5, MPI_COMM_WORLD); free(A); #endif //Close things up bam_hdr_destroy(header); bam_destroy1(read1); bam_destroy1(read2); free(cmd); if(packed_read->packed != NULL) free(packed_read->packed); free(packed_read); if(packed_header->packed != NULL) free(packed_header->packed); free(packed_header); free(last_qname); sam_pclose(fp); //Remove the FIFO(s) unlink(fastq1); if(config.paired) unlink(fastq2); #ifdef DEBUG sam_close(of); bam_hdr_destroy(debug_header); bam_destroy1(debug_read); free(p); #endif if(!config.quiet) fprintf(stderr, "Exiting worker node %i\n", thread_id); fflush(stderr); };
SEXP extract_pair_data(SEXP bam, SEXP index, SEXP chr, SEXP start, SEXP end, SEXP mapq, SEXP dedup, SEXP diagnostics) try { // Checking input values. if (!isInteger(mapq) || LENGTH(mapq)!=1) { throw std::runtime_error("mapping quality should be an integer scalar"); } const int minqual=asInteger(mapq); if (!isLogical(dedup) || LENGTH(dedup)!=1) { throw std::runtime_error("duplicate removal should be a logical scalar"); } const bool rmdup=asLogical(dedup); if (!isLogical(diagnostics) || LENGTH(diagnostics)!=1) { throw std::runtime_error("diagnostics specification should be a logical scalar"); } const bool getnames=asLogical(diagnostics); // Initializing odds and ends. BamFile bf(bam, index); BamRead br; BamIterator biter(bf, chr, start, end); OutputContainer oc(getnames); typedef std::map<std::pair<int, std::string>, AlignData> Holder; std::deque<Holder> all_holders(4); // four holders, one for each strand/first combination; cut down searches. std::pair<int, std::string> current; Holder::iterator ith; int curpos, mate_pos; AlignData algn_data; bool am_mapped, is_first; bool mate_is_in; std::set<std::string> identical_pos; std::set<std::string>::iterator itip; int last_identipos=-1; while (bam_itr_next(bf.in, biter.iter, br.read) >= 0){ ++oc.totals; curpos = (br.read->core).pos + 1; // Getting 1-indexed position. br.extract_data(algn_data); am_mapped=br.is_well_mapped(minqual, rmdup); /* Reasons to not add a read: */ // // If we can see that it is obviously unmapped (IMPOSSIBLE for a sorted file). // if (((br.read -> core).flag & BAM_FUNMAP)!=0) { // // We don't filter by additional mapping criteria, as we need to search 'holder' to pop out the partner and to store diagnostics. // continue; // } // If it's a singleton. if (((br.read -> core).flag & BAM_FPAIRED)==0) { if (am_mapped) { oc.add_single(curpos, algn_data); } continue; } // Or, if we can see that its partner is obviously unmapped. 
if (((br.read -> core).flag & BAM_FMUNMAP)!=0) { if (am_mapped) { oc.add_onemapped(curpos, algn_data); } continue; } // Or if it's inter-chromosomal. is_first=(((br.read->core).flag & BAM_FREAD1)!=0); if (is_first==(((br.read->core).flag & BAM_FREAD2)!=0)) { std::stringstream err; err << "read '" << bam_get_qname(br.read) << "' must be either first or second in the pair"; throw std::runtime_error(err.str()); } if ((br.read -> core).mtid!=(br.read -> core).tid) { if (am_mapped) { oc.add_interchr(curpos, algn_data, bam_get_qname(br.read), is_first); } continue; } /* Checking the map and adding it if it doesn't exist. */ current.second.assign(bam_get_qname(br.read)); mate_pos = (br.read -> core).mpos + 1; // 1-indexed position, again. mate_is_in=false; if (mate_pos < curpos) { mate_is_in=true; } else if (mate_pos == curpos) { // Identical mpos to curpos needs careful handling to figure out whether we've already seen it. if (curpos!=last_identipos) { identical_pos.clear(); last_identipos=curpos; } itip=identical_pos.lower_bound(current.second); if (itip!=identical_pos.end() && !(identical_pos.key_comp()(current.second, *itip))) { mate_is_in=true; identical_pos.erase(itip); } else { identical_pos.insert(itip, current.second); } } if (mate_is_in) { current.first = mate_pos; Holder& holder=all_holders[int(!is_first) + 2*int(bam_is_mrev(br.read))]; ith=holder.find(current); if (ith != holder.end()) { if (!am_mapped) { // Searching to pop out the mate, to reduce the size of 'holder' for the remaining searches (and to store diagnostics). oc.add_onemapped((ith->first).first, ith->second); holder.erase(ith); continue; } oc.add_genuine(curpos, algn_data, (ith->first).first, ith->second, is_first); holder.erase(ith); } else if (am_mapped) { // Only possible if the mate didn't get added because 'am_mapped' was false. 
oc.add_onemapped(curpos, algn_data); } } else if (am_mapped) { current.first = curpos; Holder& holder=all_holders[int(is_first) + 2*int(algn_data.is_reverse)]; holder[current] = algn_data; } } // Leftovers treated as one_unmapped; marked as paired, but the mate is not in file. for (size_t h=0; h<all_holders.size(); ++h) { Holder& holder=all_holders[h]; for (ith=holder.begin(); ith!=holder.end(); ++ith) { oc.add_onemapped((ith->first).first, ith->second); } holder.clear(); } // Storing all output. SEXP output=PROTECT(allocVector(VECSXP, getnames ? 9 : 2)); try { SET_VECTOR_ELT(output, 0, allocVector(VECSXP, 2)); SEXP left=VECTOR_ELT(output, 0); store_int_output(left, 0, oc.forward_pos_out); store_int_output(left, 1, oc.forward_len_out); SET_VECTOR_ELT(output, 1, allocVector(VECSXP, 2)); SEXP right=VECTOR_ELT(output, 1); store_int_output(right, 0, oc.reverse_pos_out); store_int_output(right, 1, oc.reverse_len_out); if (getnames) { SET_VECTOR_ELT(output, 2, ScalarInteger(oc.totals)); SET_VECTOR_ELT(output, 3, allocVector(VECSXP, 2)); SEXP singles=VECTOR_ELT(output, 3); store_int_output(singles, 0, oc.single_pos); store_int_output(singles, 1, oc.single_len); SET_VECTOR_ELT(output, 4, allocVector(VECSXP, 2)); SEXP first=VECTOR_ELT(output, 4); store_int_output(first, 0, oc.ufirst_pos); store_int_output(first, 1, oc.ufirst_len); SET_VECTOR_ELT(output, 5, allocVector(VECSXP, 2)); SEXP second=VECTOR_ELT(output, 5); store_int_output(second, 0, oc.usecond_pos); store_int_output(second, 1, oc.usecond_len); SET_VECTOR_ELT(output, 6, allocVector(VECSXP, 2)); SEXP onemap=VECTOR_ELT(output, 6); store_int_output(onemap, 0, oc.onemap_pos); store_int_output(onemap, 1, oc.onemap_len); SET_VECTOR_ELT(output, 7, allocVector(VECSXP, 3)); SEXP interchr1=VECTOR_ELT(output, 7); store_int_output(interchr1, 0, oc.ifirst_pos); store_int_output(interchr1, 1, oc.ifirst_len); store_names(interchr1, 2, oc.interchr_names_1); SET_VECTOR_ELT(output, 8, allocVector(VECSXP, 3)); SEXP 
interchr2=VECTOR_ELT(output, 8); store_int_output(interchr2, 0, oc.isecond_pos); store_int_output(interchr2, 1, oc.isecond_len); store_names(interchr2, 2, oc.interchr_names_2); } } catch (std::exception &e) { UNPROTECT(1); throw; } UNPROTECT(1); return output; } catch (std::exception &e) { return mkString(e.what()); }
orientation = O_FF; } key->single = 1; key->this_ref = this_ref; key->this_coord = this_coord; key->orientation = orientation; } /* Add the duplicate name to a hash if it does not exist. */ static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe) { khiter_t d; int ret; d = kh_get(duplicates, d_hash, bam_get_qname(dupe)); if (d == kh_end(d_hash)) { d = kh_put(duplicates, d_hash, strdup(bam_get_qname(dupe)), &ret); if (ret > 0) { kh_value(d_hash, d) = 1; } else if (ret == 0) { kh_value(d_hash, d)++; } else { fprintf(stderr, "[markdup] error: unable to store supplementary duplicates.\n"); return 1; } } return 0;
int scorereads_main(int argc, char** argv) { parse_scorereads_options(argc, argv); omp_set_num_threads(opt::num_threads); Fast5Map name_map(opt::reads_file); ModelMap models; if (!opt::models_fofn.empty()) models = read_models_fofn(opt::models_fofn); // Open the BAM and iterate over reads // load bam file htsFile* bam_fh = sam_open(opt::bam_file.c_str(), "r"); assert(bam_fh != NULL); // load bam index file std::string index_filename = opt::bam_file + ".bai"; hts_idx_t* bam_idx = bam_index_load(index_filename.c_str()); assert(bam_idx != NULL); // read the bam header bam_hdr_t* hdr = sam_hdr_read(bam_fh); // load reference fai file faidx_t *fai = fai_load(opt::genome_file.c_str()); hts_itr_t* itr; // If processing a region of the genome, only emit events aligned to this window int clip_start = -1; int clip_end = -1; if(opt::region.empty()) { // TODO: is this valid? itr = sam_itr_queryi(bam_idx, HTS_IDX_START, 0, 0); } else { fprintf(stderr, "Region: %s\n", opt::region.c_str()); itr = sam_itr_querys(bam_idx, hdr, opt::region.c_str()); hts_parse_reg(opt::region.c_str(), &clip_start, &clip_end); } #ifndef H5_HAVE_THREADSAFE if(opt::num_threads > 1) { fprintf(stderr, "You enabled multi-threading but you do not have a threadsafe HDF5\n"); fprintf(stderr, "Please recompile nanopolish's built-in libhdf5 or run with -t 1\n"); exit(1); } #endif // Initialize iteration std::vector<bam1_t*> records(opt::batch_size, NULL); for(size_t i = 0; i < records.size(); ++i) { records[i] = bam_init1(); } int result; size_t num_reads_realigned = 0; size_t num_records_buffered = 0; do { assert(num_records_buffered < records.size()); // read a record into the next slot in the buffer result = sam_itr_next(bam_fh, itr, records[num_records_buffered]); num_records_buffered += result >= 0; // realign if we've hit the max buffer size or reached the end of file if(num_records_buffered == records.size() || result < 0) { #pragma omp parallel for schedule(dynamic) for(size_t i = 0; i < 
num_records_buffered; ++i) { bam1_t* record = records[i]; size_t read_idx = num_reads_realigned + i; if( (record->core.flag & BAM_FUNMAP) == 0) { //load read std::string read_name = bam_get_qname(record); std::string fast5_path = name_map.get_path(read_name); SquiggleRead sr(read_name, fast5_path); // TODO: early exit when have processed all of the reads in readnames if (!opt::readnames.empty() && std::find(opt::readnames.begin(), opt::readnames.end(), read_name) == opt::readnames.end() ) continue; for(size_t strand_idx = 0; strand_idx < NUM_STRANDS; ++strand_idx) { std::vector<EventAlignment> ao = alignment_from_read(sr, strand_idx, read_idx, models, fai, hdr, record, clip_start, clip_end); if (ao.size() == 0) continue; // Update pore model based on alignment if ( opt::calibrate ) recalibrate_model(sr, strand_idx, ao, false); double score = model_score(sr, strand_idx, fai, ao, 500); if (score > 0) continue; #pragma omp critical(print) std::cout << read_name << " " << ( strand_idx ? "complement" : "template" ) << " " << sr.pore_model[strand_idx].name << " " << score << std::endl; } } } num_reads_realigned += num_records_buffered; num_records_buffered = 0; } } while(result >= 0); // cleanup records for(size_t i = 0; i < records.size(); ++i) { bam_destroy1(records[i]); } // cleanup sam_itr_destroy(itr); bam_hdr_destroy(hdr); fai_destroy(fai); sam_close(bam_fh); hts_idx_destroy(bam_idx); return 0; }
/*
 * Writes the header of every output file, then copies each record of the
 * merged input to the output file registered for its RG tag in
 * state->rg_hash. Records whose RG is missing or not in the hash go to
 * state->unaccounted_file when there is one; otherwise the split fails.
 *
 * Returns true when the whole input was read and written successfully, and
 * false (after printing a message to pysamerr) if a header or record could
 * not be written, a record could not be placed, memory could not be
 * allocated, or reading the input failed before end of file.
 */
static bool split(state_t* state)
{
    if (state->unaccounted_file && sam_hdr_write(state->unaccounted_file, state->unaccounted_header) != 0) {
        fprintf(pysamerr, "Could not write output file header\n");
        return false;
    }
    size_t i;
    for (i = 0; i < state->output_count; i++) {
        if (sam_hdr_write(state->rg_output_file[i], state->rg_output_header[i]) != 0) {
            fprintf(pysamerr, "Could not write output file header\n");
            return false;
        }
    }

    bam1_t* file_read = bam_init1();
    if (file_read == NULL) {
        fprintf(pysamerr, "Could not allocate memory for a read\n");
        return false;
    }

    bool ok = true;
    int read_ret;
    while ((read_ret = sam_read1(state->merged_input_file, state->merged_input_header, file_read)) >= 0) {
        // Get RG tag from read and look it up in hash to find file to output it to.
        // bam_aux2Z() yields NULL for a tag that is not a string, which must
        // not be handed to the hash lookup or to "%s".
        uint8_t* tag = bam_aux_get(file_read, "RG");
        const char* rg = (tag != NULL) ? bam_aux2Z(tag) : NULL;
        khiter_t iter = kh_end(state->rg_hash);
        if (rg != NULL) {
            iter = kh_get_c2i(state->rg_hash, rg);
        }

        // Write the read out to correct file
        int write_ret;
        if (iter != kh_end(state->rg_hash)) {
            // if found write to the appropriate untangled bam
            int out_idx = kh_val(state->rg_hash, iter);
            write_ret = sam_write1(state->rg_output_file[out_idx], state->rg_output_header[out_idx], file_read);
        } else if (state->unaccounted_file != NULL) {
            // otherwise write to the unaccounted bam if there is one
            write_ret = sam_write1(state->unaccounted_file, state->unaccounted_header, file_read);
        } else {
            // nowhere to put this read: fail
            if (rg != NULL) {
                fprintf(pysamerr, "Read \"%s\" with unaccounted for tag \"%s\".\n", bam_get_qname(file_read), rg);
            } else {
                fprintf(pysamerr, "Read \"%s\" has no RG tag.\n", bam_get_qname(file_read));
            }
            ok = false;
            break;
        }

        if (write_ret < 0) {
            fprintf(pysamerr, "Could not write read \"%s\" to output file\n", bam_get_qname(file_read));
            ok = false;
            break;
        }
    }

    // -1 is a clean end of file; anything lower means the input was cut short
    if (ok && read_ret < -1) {
        fprintf(pysamerr, "Could not read from input file: truncated or corrupt?\n");
        ok = false;
    }

    bam_destroy1(file_read);
    return ok;
}
/*
 * process one BAM record, and store accumulated results in 'results'
 *
 * Four CRCs are computed for the record and fed to update_digest_line(), once
 * for the record's read group (looked up by RG tag in results->rgHash, with
 * "" used when there is no usable RG tag) and once for results->all:
 *
 *   slot 0: flags + sequence
 *   slot 2: flags + sequence + quality
 *   slot 1: name (including its terminating NUL) + flags + sequence
 *   slot 3: flags + sequence + the BC, FI, QT, RT and TC aux fields present
 *
 * 'flags' is the low byte of the PAIRED/READ1/READ2 bits only. Whether the
 * record counts as a pass is decided by the absence of BAM_FQCFAIL.
 *
 * Returns 0 on success, or -1 if the sequence/quality buffers, the hash entry
 * or its digest line could not be obtained; nothing is accumulated for the
 * record in that case.
 */
int seqchksum_processRecord(bam1_t *rec, HASH_TYPE hash, chksum_results_t *results)
{
    /* Aux tags folded into the slot 3 checksum, in this order */
    static const char *const crc_tags[] = { "BC", "FI", "QT", "RT", "TC" };
    const size_t n_crc_tags = sizeof(crc_tags) / sizeof(crc_tags[0]);

    uint32_t crc = 0;
    uint16_t aflags = rec->core.flag;
    uint8_t *seq = get_read(rec);
    uint8_t *qual = get_quality(rec);
    uint16_t flag_mask = BAM_FPAIRED | BAM_FREAD1 | BAM_FREAD2;
    uint8_t flags = (aflags & flag_mask) & 0xFF;
    bool pass = !(aflags & BAM_FQCFAIL);
    char *qname = bam_get_qname(rec);
    uint8_t *tag;
    char *rgid;
    HashItem *hi;
    HashData hd;
    int newitem;
    digest_line_t *dline_all;
    digest_line_t *dline;
    size_t seq_len;
    size_t t;
    int ret = -1;

    /* Both buffers are handed to strlen() and free()d below */
    if (seq == NULL || qual == NULL) goto done;
    seq_len = strlen((char*)seq);

    // look up the RG tag; a missing or non-string tag maps to the "" group
    tag = bam_aux_get(rec, "RG");
    rgid = tag ? bam_aux2Z(tag) : NULL;
    if (rgid == NULL) rgid = "";

    hd.p = NULL;
    hi = HashTableAdd(results->rgHash, rgid, 0, hd, &newitem);
    if (hi == NULL) goto done;
    if (newitem || hi->data.p == NULL) {
        /* Also covers an entry left without a digest line by an earlier
           failed allocation */
        hi->data.p = malloc(sizeof(digest_line_t));
        if (hi->data.p == NULL) goto done;
        init_digest_line(hash, hi->data.p);
    }
    dline = hi->data.p;
    dline_all = &(results->all);

    // flags + sequence chksum
    update_crc(&crc, &flags, 1);
    update_crc(&crc, seq, seq_len);
    update_digest_line(hash, pass, dline, crc, 0);
    update_digest_line(hash, pass, dline_all, crc, 0);

    // flags + sequence + quality chksum (don't reset crc, just add quality)
    update_crc(&crc, qual, strlen((char*)qual));
    update_digest_line(hash, pass, dline, crc, 2);
    update_digest_line(hash, pass, dline_all, crc, 2);

    // name + flags + sequence chksum
    crc = 0;
    update_crc(&crc, (uint8_t *)qname, strlen(qname)+1);
    update_crc(&crc, &flags, 1);
    update_crc(&crc, seq, seq_len);
    update_digest_line(hash, pass, dline, crc, 1);
    update_digest_line(hash, pass, dline_all, crc, 1);

    // flags + sequence + tags chksum
    crc = 0;
    update_crc(&crc, &flags, 1);
    update_crc(&crc, seq, seq_len);
    for (t = 0; t < n_crc_tags; t++) {
        tag = bam_aux_get(rec, crc_tags[t]);
        // starts 2 bytes before the returned pointer; presumably so the
        // two-letter tag name and the type byte are included - TODO confirm
        if (tag) update_crc(&crc, tag-2, aux_type2size(tag)+3);
    }
    update_digest_line(hash, pass, dline, crc, 3);
    update_digest_line(hash, pass, dline_all, crc, 3);

    ret = 0;

done:
    free(seq);
    free(qual);
    return ret;
}
/*
 * Converts padded alignments read from 'in' into unpadded ones written to
 * 'out'.
 *
 * A record at position 0 whose name equals the name of its reference is taken
 * as an embedded (padded) reference sequence: it is unpadded, checked against
 * the header length and, when 'fai' is given, against the FASTA, and then
 * becomes the current reference. For any other record with a CIGAR, the
 * reference is the current one or is loaded from 'fai'; the CIGAR is rebuilt
 * base by base from the unpadded read and reference, and redundant P
 * operators between M/D operators are removed. POS and MPOS are translated
 * through the position map of the relevant reference and the BIN is
 * recalculated. Records flagged as unmapped are copied through unchanged.
 *
 * Returns 0 on success, 1 if the input ended with a read error (truncated
 * file), and -1 on any other error (message on stderr). All buffers owned by
 * this function are released on every path.
 */
int bam_pad2unpad(samFile *in, samFile *out, bam_hdr_t *h, faidx_t *fai)
{
    bam1_t *b = 0;
    kstring_t r, q;
    int r_tid = -1;
    uint32_t *cigar2 = 0;
    int ret = 0, n2 = 0, m2 = 0, *posmap = 0;
    int read_ret;

    r.l = r.m = q.l = q.m = 0; r.s = q.s = 0;
    b = bam_init1();
    if (!b) {
        fprintf(stderr, "[depad] ERROR: Couldn't allocate memory for an alignment\n");
        return -1;
    }

    while ((read_ret = sam_read1(in, h, b)) >= 0) { // read one alignment from `in'
        // Cannot depad unmapped CRAM data
        if (b->core.flag & BAM_FUNMAP)
            goto next_seq;

        uint32_t *cigar = bam_get_cigar(b);
        n2 = 0;
        if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam_get_qname(b), h->target_name[b->core.tid]) == 0) {
            // fprintf(stderr, "[depad] Found embedded reference '%s'\n", bam_get_qname(b));
            r_tid = b->core.tid;
            if (0!=unpad_seq(b, &r)) {
                fprintf(stderr, "[depad] ERROR: Problem parsing SEQ and/or CIGAR in reference %s\n", bam_get_qname(b));
                ret = -1;
                goto depad_end;
            }
            if (h->target_len[r_tid] != r.l) {
                fprintf(stderr, "[depad] ERROR: (Padded) length of '%s' is %u in BAM header, but %llu in embedded reference\n", bam_get_qname(b), h->target_len[r_tid], (unsigned long long)(r.l));
                ret = -1;
                goto depad_end;
            }
            if (fai) {
                // Check the embedded reference matches the FASTA file
                if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &q)) {
                    fprintf(stderr, "[depad] ERROR: Failed to load embedded reference '%s' from FASTA\n", h->target_name[b->core.tid]);
                    ret = -1;
                    goto depad_end;
                }
                assert(r.l == q.l);
                int i;
                for (i = 0; i < r.l; ++i) {
                    if (r.s[i] != q.s[i]) {
                        // Show gaps as ASCII 45
                        fprintf(stderr, "[depad] ERROR: Embedded sequence and reference FASTA don't match for %s base %i, '%c' vs '%c'\n",
                            h->target_name[b->core.tid], i+1,
                            r.s[i] ? seq_nt16_str[(int)r.s[i]] : 45,
                            q.s[i] ? seq_nt16_str[(int)q.s[i]] : 45);
                        ret = -1;
                        goto depad_end;
                    }
                }
            }
            write_cigar(cigar2, n2, m2, bam_cigar_gen(b->core.l_qseq, BAM_CMATCH));
            replace_cigar(b, n2, cigar2);
            posmap = update_posmap(posmap, r);
        } else if (b->core.n_cigar > 0) {
            int i, k, op;
            if (b->core.tid < 0) {
                fprintf(stderr, "[depad] ERROR: Read '%s' has CIGAR but no RNAME\n", bam_get_qname(b));
                ret = -1;
                goto depad_end;
            } else if (b->core.tid == r_tid) {
                ; // good case, reference available
                //fprintf(stderr, "[depad] Have ref '%s' for read '%s'\n", h->target_name[b->core.tid], bam_get_qname(b));
            } else if (fai) {
                if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) {
                    fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]);
                    ret = -1;
                    goto depad_end;
                }
                posmap = update_posmap(posmap, r);
                r_tid = b->core.tid;
                // fprintf(stderr, "[depad] Loaded %s from FASTA file\n", h->target_name[b->core.tid]);
            } else {
                fprintf(stderr, "[depad] ERROR: Missing %s embedded reference sequence (and no FASTA file)\n", h->target_name[b->core.tid]);
                ret = -1;
                goto depad_end;
            }
            if (0!=unpad_seq(b, &q)) {
                fprintf(stderr, "[depad] ERROR: Problem parsing SEQ and/or CIGAR in read %s\n", bam_get_qname(b));
                ret = -1;
                goto depad_end;
            }
            /* Leading clips are carried over unchanged */
            if (bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) {
                write_cigar(cigar2, n2, m2, cigar[0]);
            } else if (bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP) {
                write_cigar(cigar2, n2, m2, cigar[0]);
                if (b->core.n_cigar > 2 && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP) {
                    write_cigar(cigar2, n2, m2, cigar[1]);
                }
            }
            /* Determine CIGAR operator for each base in the aligned read */
            for (i = 0, k = b->core.pos; i < q.l; ++i, ++k)
                q.s[i] = q.s[i]? (r.s[k]? BAM_CMATCH : BAM_CINS) : (r.s[k]? BAM_CDEL : BAM_CPAD);
            /* Include any pads if starts with an insert */
            if (q.s[0] == BAM_CINS) {
                for (k = 0; k+1 < b->core.pos && !r.s[b->core.pos - k - 1]; ++k);
                if (k) write_cigar(cigar2, n2, m2, bam_cigar_gen(k, BAM_CPAD));
                k = 0;
            } else if (q.s[0] == BAM_CPAD) {
                // Join 'k' CPAD to our first cigar op CPAD too.
                for (k = 0; k+1 < b->core.pos && !r.s[b->core.pos - k - 1]; ++k);
            } else {
                k = 0;
            }
            /* Count consecutive CIGAR operators to turn into a CIGAR string */
            for (i = 1, k++, op = q.s[0]; i < q.l; ++i) {
                if (op != q.s[i]) {
                    write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
                    op = q.s[i]; k = 1;
                } else ++k;
            }
            write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
            /* Trailing clips are carried over unchanged */
            if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CSOFT_CLIP) {
                write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]);
            } else if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CHARD_CLIP) {
                if (b->core.n_cigar > 2 && bam_cigar_op(cigar[b->core.n_cigar-2]) == BAM_CSOFT_CLIP) {
                    write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-2]);
                }
                write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]);
            }
            /* Remove redundant P operators between M/X/=/D operators, e.g. 5M2P10M -> 15M */
            int pre_op, post_op;
            for (i = 2; i < n2; ++i)
                if (bam_cigar_op(cigar2[i-1]) == BAM_CPAD) {
                    pre_op = bam_cigar_op(cigar2[i-2]);
                    post_op = bam_cigar_op(cigar2[i]);
                    /* Note don't need to check for X/= as code above will use M only */
                    if ((pre_op == BAM_CMATCH || pre_op == BAM_CDEL) && (post_op == BAM_CMATCH || post_op == BAM_CDEL)) {
                        /* This is a redundant P operator */
                        cigar2[i-1] = 0; // i.e. 0M
                        /* If had same operator either side, combine them in post_op */
                        if (pre_op == post_op) {
                            /* If CIGAR M, could treat as simple integers since BAM_CMATCH is zero*/
                            cigar2[i] = bam_cigar_gen(bam_cigar_oplen(cigar2[i-2]) + bam_cigar_oplen(cigar2[i]), post_op);
                            cigar2[i-2] = 0; // i.e. 0M
                        }
                    }
                }
            /* Remove the zero'd operators (0M) */
            for (i = k = 0; i < n2; ++i)
                if (cigar2[i]) cigar2[k++] = cigar2[i];
            n2 = k;
            replace_cigar(b, n2, cigar2);
        }
        /* Even unmapped reads can have a POS value, e.g. if their mate was mapped */
        /* NOTE(review): posmap is still NULL here if no reference has been
           seen or loaded yet (e.g. a first record without a CIGAR) - confirm
           whether such input can occur. */
        if (b->core.pos != -1) b->core.pos = posmap[b->core.pos];
        if (b->core.mtid < 0 || b->core.mpos < 0) {
            /* Nice case, no mate to worry about*/
            // fprintf(stderr, "[depad] Read '%s' mate not mapped\n", bam_get_qname(b));
            /* TODO - Warning if FLAG says mate should be mapped? */
            /* Clean up funny input where mate position is given but mate reference is missing: */
            b->core.mtid = -1;
            b->core.mpos = -1;
        } else if (b->core.mtid == b->core.tid) {
            /* Nice case, same reference */
            // fprintf(stderr, "[depad] Read '%s' mate mapped to same ref\n", bam_get_qname(b));
            b->core.mpos = posmap[b->core.mpos];
        } else {
            /* Nasty case, Must load alternative posmap */
            // fprintf(stderr, "[depad] Loading reference '%s' temporarily\n", h->target_name[b->core.mtid]);
            if (!fai) {
                fprintf(stderr, "[depad] ERROR: Needed reference %s sequence for mate (and no FASTA file)\n", h->target_name[b->core.mtid]);
                ret = -1;
                goto depad_end;
            }
            /* Temporarily load the other reference sequence */
            if (load_unpadded_ref(fai, h->target_name[b->core.mtid], h->target_len[b->core.mtid], &r)) {
                fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.mtid]);
                ret = -1;
                goto depad_end;
            }
            posmap = update_posmap(posmap, r);
            b->core.mpos = posmap[b->core.mpos];
            /* Restore the reference and posmap*/
            if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) {
                fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]);
                ret = -1;
                goto depad_end;
            }
            posmap = update_posmap(posmap, r);
        }
        /* Most reads will have been moved so safest to always recalculate the BIN value */
        b->core.bin = bam_reg2bin(b->core.pos, bam_endpos(b));

    next_seq:
        if (sam_write1(out, h, b) < 0) {
            fprintf(stderr, "[depad] ERROR: Failed to write alignment to output\n");
            ret = -1;
            goto depad_end;
        }
    }
    if (read_ret < -1) {
        fprintf(stderr, "[depad] truncated file.\n");
        ret = 1;
    }

depad_end:
    /* Single exit point: release everything this function allocated,
       including the scratch CIGAR grown by write_cigar() */
    free(r.s); free(q.s); free(posmap);
    free(cigar2);
    bam_destroy1(b);
    return ret;
}