SEXP get_leftovers (SEXP bam, SEXP index, SEXP processed) try { BamFile bf(bam, index); BamRead br; if (!isString(processed)) { throw std::runtime_error("names of processed chromosomes should be strings"); } const int nchr=LENGTH(processed); std::set<std::string> already_there; for (int i=0; i<nchr; ++i) { already_there.insert(std::string(CHAR(STRING_ELT(processed, i)))); } // Getting the reads mapped to chromosomes we didn't look at due to 'restrict'. int leftovers=0; std::set<std::string>::iterator iat; for (int cid=0; cid<bf.header->n_targets; ++cid) { iat=already_there.find(std::string(bf.header->target_name[cid])); if (iat!=already_there.end()) { continue; } BamIterator biter(bf, cid, 0, bf.header->target_len[cid]); while (bam_itr_next(bf.in, biter.iter, br.read) >= 0){ ++leftovers; } } // Also getting the unmapped guys. BamIterator biter(bf); while (bam_itr_next(bf.in, biter.iter, br.read) >= 0){ ++leftovers; } return(ScalarInteger(leftovers)); } catch (std::exception &e) { return mkString(e.what()); }
/**
 * Reads the next record into s, hiding the random access of different
 * regions from the user.
 *
 * @param s  record to fill.
 * @return true if a record was read, false once no more records are available.
 */
bool BAMOrderedReader::read(bam1_t *s)
{
    if (!random_access_enabled)
    {
        // Plain sequential read of the whole file.
        // todo: filter via interval tree
        //       if found in tree, return true else false
        return bam_read1(sam->fp.bgzf, s) >= 0;
    }

    // Random access: drain the current iterator, then move on to the next
    // interval until initialize_next_interval() reports there is none left.
    for (;;)
    {
        if (itr && bam_itr_next(sam, itr, s) >= 0)
        {
            return true;
        }

        if (!initialize_next_interval())
        {
            return false;
        }
    }
}
/**
 * Walks over the reads that BamIterator yields for (chr, start, end) and sorts
 * them into the buckets of an OutputContainer: genuine pairs, singles, reads
 * whose mate is unmapped or missing, and inter-chromosomal pairs.
 *
 * Reads that arrive before their mate are parked in one of four maps keyed by
 * (1-based position, read name); when the mate arrives it looks its partner up
 * under the partner's position and name.
 * NOTE(review): the "mate position < current position means the mate was
 * already seen" test presumably relies on coordinate-sorted input -- confirm.
 *
 * @param bam, index        forwarded to the BamFile constructor.
 * @param chr, start, end   forwarded to the BamIterator constructor.
 * @param mapq              integer scalar, passed to BamRead::is_well_mapped().
 * @param dedup             logical scalar, passed to BamRead::is_well_mapped().
 * @param diagnostics       logical scalar; when true the result has 9 elements
 *                          instead of 2.
 * @return an R list. Elements 0 and 1 are 2-element lists filled from
 *         oc.forward_{pos,len}_out and oc.reverse_{pos,len}_out. With
 *         diagnostics, element 2 is oc.totals (number of reads iterated) and
 *         elements 3-8 hold the single, ufirst, usecond, onemap, ifirst and
 *         isecond pos/len vectors (7 and 8 also carry read names).
 *         If a std::exception is thrown, a character scalar holding its
 *         message is returned instead.
 */
SEXP extract_pair_data(SEXP bam, SEXP index, SEXP chr, SEXP start, SEXP end, SEXP mapq, SEXP dedup, SEXP diagnostics) try {
    // Checking input values.
    if (!isInteger(mapq) || LENGTH(mapq)!=1) {
        throw std::runtime_error("mapping quality should be an integer scalar");
    }
    const int minqual=asInteger(mapq);
    if (!isLogical(dedup) || LENGTH(dedup)!=1) {
        throw std::runtime_error("duplicate removal should be a logical scalar");
    }
    const bool rmdup=asLogical(dedup);
    if (!isLogical(diagnostics) || LENGTH(diagnostics)!=1) {
        throw std::runtime_error("diagnostics specification should be a logical scalar");
    }
    const bool getnames=asLogical(diagnostics);

    // Initializing odds and ends.
    BamFile bf(bam, index);
    BamRead br;
    BamIterator biter(bf, chr, start, end);
    OutputContainer oc(getnames);

    // A Holder maps (1-based position, read name) to the alignment data of a
    // read that is still waiting for its mate.
    typedef std::map<std::pair<int, std::string>, AlignData> Holder;
    std::deque<Holder> all_holders(4); // four holders, one for each strand/first combination; cut down searches.
    std::pair<int, std::string> current; // lookup/insert key, reused across iterations.
    Holder::iterator ith;
    int curpos, mate_pos;
    AlignData algn_data;
    bool am_mapped, is_first;

    // State for reads whose mate has the same position: the names seen so far
    // at position 'last_identipos' are kept in 'identical_pos'.
    bool mate_is_in;
    std::set<std::string> identical_pos;
    std::set<std::string>::iterator itip;
    int last_identipos=-1;

    while (bam_itr_next(bf.in, biter.iter, br.read) >= 0){
        ++oc.totals;
        curpos = (br.read->core).pos + 1; // Getting 1-indexed position.
        br.extract_data(algn_data);
        am_mapped=br.is_well_mapped(minqual, rmdup);

        /* Reasons to not add a read: */

//        // If we can see that it is obviously unmapped (IMPOSSIBLE for a sorted file).
//        if (((br.read -> core).flag & BAM_FUNMAP)!=0) {
//            // We don't filter by additional mapping criteria, as we need to search 'holder' to pop out the partner and to store diagnostics.
//            continue;
//        }

        // If it's a singleton (not flagged as paired).
        if (((br.read -> core).flag & BAM_FPAIRED)==0) {
            if (am_mapped) { oc.add_single(curpos, algn_data); }
            continue;
        }

        // Or, if we can see that its partner is obviously unmapped.
        if (((br.read -> core).flag & BAM_FMUNMAP)!=0) {
            if (am_mapped) { oc.add_onemapped(curpos, algn_data); }
            continue;
        }

        // Or if it's inter-chromosomal.
        // Exactly one of the READ1/READ2 flags must be set; both or neither is an error.
        is_first=(((br.read->core).flag & BAM_FREAD1)!=0);
        if (is_first==(((br.read->core).flag & BAM_FREAD2)!=0)) {
            std::stringstream err;
            err << "read '" << bam_get_qname(br.read) << "' must be either first or second in the pair";
            throw std::runtime_error(err.str());
        }
        if ((br.read -> core).mtid!=(br.read -> core).tid) {
            if (am_mapped) { oc.add_interchr(curpos, algn_data, bam_get_qname(br.read), is_first); }
            continue;
        }

        /* Checking the map and adding it if it doesn't exist. */
        current.second.assign(bam_get_qname(br.read));
        mate_pos = (br.read -> core).mpos + 1; // 1-indexed position, again.
        mate_is_in=false;
        if (mate_pos < curpos) {
            // The mate sits at an earlier position, so it is treated as already seen.
            mate_is_in=true;
        } else if (mate_pos == curpos) {
            // Identical mpos to curpos needs careful handling to figure out whether we've already seen it.
            // The name set is reset whenever the position changes.
            if (curpos!=last_identipos) {
                identical_pos.clear();
                last_identipos=curpos;
            }
            // lower_bound + key_comp: one search serves both as the membership
            // test and as the insertion hint.
            itip=identical_pos.lower_bound(current.second);
            if (itip!=identical_pos.end() && !(identical_pos.key_comp()(current.second, *itip))) {
                // Name already recorded at this position: this is the second read of the pair.
                mate_is_in=true;
                identical_pos.erase(itip);
            } else {
                // First read of the pair at this position: remember its name.
                identical_pos.insert(itip, current.second);
            }
        }

        if (mate_is_in) {
            // Look the mate up under the mate's own key and in the mate's own
            // holder (mate's first-ness is !is_first, mate's strand is bam_is_mrev).
            current.first = mate_pos;
            Holder& holder=all_holders[int(!is_first) + 2*int(bam_is_mrev(br.read))];
            ith=holder.find(current);

            if (ith != holder.end()) {
                if (!am_mapped) {
                    // Searching to pop out the mate, to reduce the size of 'holder' for the remaining searches (and to store diagnostics).
                    oc.add_onemapped((ith->first).first, ith->second);
                    holder.erase(ith);
                    continue;
                }
                // Both reads pass: record the pair and drop the stored mate.
                oc.add_genuine(curpos, algn_data, (ith->first).first, ith->second, is_first);
                holder.erase(ith);
            } else if (am_mapped) {
                // Only possible if the mate didn't get added because 'am_mapped' was false.
                oc.add_onemapped(curpos, algn_data);
            }
        } else if (am_mapped) {
            // Mate not seen yet: park this read under its own position and
            // first/strand combination until the mate turns up.
            current.first = curpos;
            Holder& holder=all_holders[int(is_first) + 2*int(algn_data.is_reverse)];
            holder[current] = algn_data;
        }
    }

    // Leftovers treated as one_unmapped; marked as paired, but the mate is not in file.
    for (size_t h=0; h<all_holders.size(); ++h) {
        Holder& holder=all_holders[h];
        for (ith=holder.begin(); ith!=holder.end(); ++ith) {
            oc.add_onemapped((ith->first).first, ith->second);
        }
        holder.clear();
    }

    // Storing all output.
    // 'output' is protected for the duration of the fill; each sub-list is
    // attached to 'output' via SET_VECTOR_ELT before being filled.
    SEXP output=PROTECT(allocVector(VECSXP, getnames ? 9 : 2));
    try {
        SET_VECTOR_ELT(output, 0, allocVector(VECSXP, 2));
        SEXP left=VECTOR_ELT(output, 0);
        store_int_output(left, 0, oc.forward_pos_out);
        store_int_output(left, 1, oc.forward_len_out);

        SET_VECTOR_ELT(output, 1, allocVector(VECSXP, 2));
        SEXP right=VECTOR_ELT(output, 1);
        store_int_output(right, 0, oc.reverse_pos_out);
        store_int_output(right, 1, oc.reverse_len_out);

        if (getnames) {
            // Diagnostics: total read count followed by one sub-list per category.
            SET_VECTOR_ELT(output, 2, ScalarInteger(oc.totals));

            SET_VECTOR_ELT(output, 3, allocVector(VECSXP, 2));
            SEXP singles=VECTOR_ELT(output, 3);
            store_int_output(singles, 0, oc.single_pos);
            store_int_output(singles, 1, oc.single_len);

            SET_VECTOR_ELT(output, 4, allocVector(VECSXP, 2));
            SEXP first=VECTOR_ELT(output, 4);
            store_int_output(first, 0, oc.ufirst_pos);
            store_int_output(first, 1, oc.ufirst_len);

            SET_VECTOR_ELT(output, 5, allocVector(VECSXP, 2));
            SEXP second=VECTOR_ELT(output, 5);
            store_int_output(second, 0, oc.usecond_pos);
            store_int_output(second, 1, oc.usecond_len);

            SET_VECTOR_ELT(output, 6, allocVector(VECSXP, 2));
            SEXP onemap=VECTOR_ELT(output, 6);
            store_int_output(onemap, 0, oc.onemap_pos);
            store_int_output(onemap, 1, oc.onemap_len);

            // The inter-chromosomal sub-lists carry a third element with read names.
            SET_VECTOR_ELT(output, 7, allocVector(VECSXP, 3));
            SEXP interchr1=VECTOR_ELT(output, 7);
            store_int_output(interchr1, 0, oc.ifirst_pos);
            store_int_output(interchr1, 1, oc.ifirst_len);
            store_names(interchr1, 2, oc.interchr_names_1);

            SET_VECTOR_ELT(output, 8, allocVector(VECSXP, 3));
            SEXP interchr2=VECTOR_ELT(output, 8);
            store_int_output(interchr2, 0, oc.isecond_pos);
            store_int_output(interchr2, 1, oc.isecond_len);
            store_names(interchr2, 2, oc.interchr_names_2);
        }
    } catch (std::exception &e) {
        // Balance the PROTECT above before handing the exception to the outer handler.
        UNPROTECT(1);
        throw;
    }

    UNPROTECT(1);
    return output;
} catch (std::exception &e) {
    // Errors are reported to the caller as a character scalar rather than thrown.
    return mkString(e.what());
}
int main_samview(int argc, char *argv[]) { samFile *in; char *fn_ref = 0; int flag = 0, c, clevel = -1, ignore_sam_err = 0; char moder[8]; bam_hdr_t *h; bam1_t *b; while ((c = getopt(argc, argv, "IbSl:t:")) >= 0) { switch (c) { case 'S': flag |= 1; break; case 'b': flag |= 2; break; case 'l': clevel = atoi(optarg); flag |= 2; break; case 't': fn_ref = optarg; break; case 'I': ignore_sam_err = 1; break; } } if (argc == optind) { fprintf(stderr, "Usage: samview [-bSI] [-l level] <in.bam>|<in.sam> [region]\n"); return 1; } strcpy(moder, "r"); if ((flag&1) == 0) strcat(moder, "b"); in = sam_open(argv[optind], moder, fn_ref); h = sam_hdr_read(in); h->ignore_sam_err = ignore_sam_err; b = bam_init1(); if ((flag&4) == 0) { // SAM/BAM output htsFile *out; char modew[8]; strcpy(modew, "w"); if (clevel >= 0 && clevel <= 9) sprintf(modew + 1, "%d", clevel); if (flag&2) strcat(modew, "b"); out = hts_open("-", modew, 0); sam_hdr_write(out, h); if (optind + 1 < argc && !(flag&1)) { // BAM input and has a region int i; hts_idx_t *idx; if ((idx = bam_index_load(argv[optind])) == 0) { fprintf(stderr, "[E::%s] fail to load the BAM index\n", __func__); return 1; } for (i = optind + 1; i < argc; ++i) { hts_itr_t *iter; if ((iter = bam_itr_querys(idx, h, argv[i])) == 0) { fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]); continue; } while (bam_itr_next((BGZF*)in->fp, iter, b) >= 0) sam_write1(out, h, b); hts_itr_destroy(iter); } hts_idx_destroy(idx); } else while (sam_read1(in, h, b) >= 0) sam_write1(out, h, b); sam_close(out); } bam_destroy1(b); bam_hdr_destroy(h); sam_close(in); return 0; }
int main(int argc, char *argv[]) { samFile *in; char *fn_ref = 0; int flag = 0, c, clevel = -1, ignore_sam_err = 0; char moder[8]; bam_hdr_t *h; bam1_t *b; htsFile *out; char modew[8]; int r = 0, exit_code = 0; while ((c = getopt(argc, argv, "IbDCSl:t:")) >= 0) { switch (c) { case 'S': flag |= 1; break; case 'b': flag |= 2; break; case 'D': flag |= 4; break; case 'C': flag |= 8; break; case 'l': clevel = atoi(optarg); flag |= 2; break; case 't': fn_ref = optarg; break; case 'I': ignore_sam_err = 1; break; } } if (argc == optind) { fprintf(stderr, "Usage: samview [-bSCSI] [-l level] <in.bam>|<in.sam>|<in.cram> [region]\n"); return 1; } strcpy(moder, "r"); if (flag&4) strcat(moder, "c"); else if ((flag&1) == 0) strcat(moder, "b"); in = sam_open(argv[optind], moder); h = sam_hdr_read(in); h->ignore_sam_err = ignore_sam_err; b = bam_init1(); strcpy(modew, "w"); if (clevel >= 0 && clevel <= 9) sprintf(modew + 1, "%d", clevel); if (flag&8) strcat(modew, "c"); else if (flag&2) strcat(modew, "b"); out = hts_open("-", modew); /* CRAM output */ if (flag & 8) { // Parse input header and use for CRAM output out->fp.cram->header = sam_hdr_parse_(h->text, h->l_text); // Create CRAM references arrays if (fn_ref) cram_set_option(out->fp.cram, CRAM_OPT_REFERENCE, fn_ref); else // Attempt to fill out a cram->refs[] array from @SQ headers cram_set_option(out->fp.cram, CRAM_OPT_REFERENCE, NULL); } sam_hdr_write(out, h); if (optind + 1 < argc && !(flag&1)) { // BAM input and has a region int i; hts_idx_t *idx; if ((idx = bam_index_load(argv[optind])) == 0) { fprintf(stderr, "[E::%s] fail to load the BAM index\n", __func__); return 1; } for (i = optind + 1; i < argc; ++i) { hts_itr_t *iter; if ((iter = bam_itr_querys(idx, h, argv[i])) == 0) { fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]); continue; } while ((r = bam_itr_next(in, iter, b)) >= 0) { if (sam_write1(out, h, b) < 0) { fprintf(stderr, "Error writing output.\n"); exit_code = 1; break; } } 
hts_itr_destroy(iter); } hts_idx_destroy(idx); } else while ((r = sam_read1(in, h, b)) >= 0) { if (sam_write1(out, h, b) < 0) { fprintf(stderr, "Error writing output.\n"); exit_code = 1; break; } } sam_close(out); if (r < -1) { fprintf(stderr, "Error parsing input.\n"); exit_code = 1; } bam_destroy1(b); bam_hdr_destroy(h); sam_close(in); return exit_code; }
/**
 * Computes per-base read coverage over a slice and prints it to stdout as
 * "<1-based position> <coverage>" lines, preceded by a summary of how many
 * reads were used and how many were rejected.
 *
 * Reads flagged unmapped, secondary, QC-fail or duplicate are counted as bad
 * and skipped. Coverage is incremented for every reference base covered by an
 * M/=/X or D CIGAR operation; N operations advance the reference position
 * without adding coverage.
 *
 * Fixes relative to the earlier version: the sequence name is now built in
 * region_name before it is used for the region string (it was read
 * uninitialised); the header returned by bam_hdr_read() is no longer preceded
 * by a leaked bam_hdr_init(); writes into the coverage array are bounds-checked
 * so reads running past the slice end cannot corrupt memory; the coverage
 * array is freed; NULL results are checked.
 *
 * @param fName  unused.
 * @param slice  region to process; its start must be 1.
 * @param in     open alignment file; the header is read from in->fp.bgzf.
 * @param idx    index used to build the region iterator.
 * @param flags  bit set; M_UCSC_NAMING prefixes the sequence name with "chr".
 * @return always 1 (both for the unsupported-slice-start case and on success);
 *         unrecoverable errors terminate the process via exit().
 */
int calcCoverage(char *fName, Slice *slice, htsFile *in, hts_idx_t *idx, int flags) {
  int  ref;
  int  begRange;
  int  endRange;
  char region[1024];
  char region_name[512];

  if (Slice_getChrStart(slice) != 1) {
    fprintf(stderr, "Currently only allow a slice start position of 1\n");
    return 1;
  }

  // Build the bare sequence name first; it is needed both for the header
  // lookup and as the prefix of the region string.
  if (flags & M_UCSC_NAMING) {
    snprintf(region_name, sizeof(region_name), "chr%s", Slice_getSeqRegionName(slice));
  } else {
    snprintf(region_name, sizeof(region_name), "%s", Slice_getSeqRegionName(slice));
  }

  bam_hdr_t *header = bam_hdr_read(in->fp.bgzf);
  if (header == NULL) {
    fprintf(stderr, "Could not read header\n");
    exit(1);
  }

  ref = bam_name2id(header, region_name);
  if (ref < 0) {
    fprintf(stderr, "Invalid region %s\n", region_name);
    exit(1);
  }

  snprintf(region, sizeof(region), "%s:%ld-%ld", region_name,
           Slice_getSeqRegionStart(slice),
           Slice_getSeqRegionEnd(slice));
  if (hts_parse_reg(region, &begRange, &endRange) == NULL) {
    fprintf(stderr, "Could not parse %s\n", region);
    exit(2);
  }
  bam_hdr_destroy(header);

  hts_itr_t *iter = sam_itr_queryi(idx, ref, begRange, endRange);
  if (iter == NULL) {
    fprintf(stderr, "Could not create iterator for %s\n", region);
    exit(1);
  }

  bam1_t *b = bam_init1();

  // One Coverage entry per base of the slice, indexed by zero-based position.
  long sliceLength = Slice_getLength(slice);
  Coverage *coverage = (Coverage *)calloc(sliceLength, sizeof(Coverage));
  if (b == NULL || coverage == NULL) {
    fprintf(stderr, "Could not allocate memory for coverage of %s\n", region);
    exit(1);
  }

  long counter = 0;
  long overlapping = 0;
  long bad = 0;
  int startIndex = 0;

  while (bam_itr_next(in, iter, b) >= 0) {
    if (b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)) {
      bad++;
      continue;
    }

    int end;
    //end = bam_calend(&b->core, bam1_cigar(b));
    end = bam_endpos(b);

    // There is a special case for reads which have zero length and start at begRange (so end at begRange ie. before the first base we're interested in).
    // That is the reason for the end == begRange test
    if (end == begRange) {
      continue;
    }
    counter++;

    if (!(counter%1000000)) {
      if (verbosity > 1) { printf("."); }
      fflush(stdout);
    }

    // Remember: b->core.pos is zero based!
    int cigInd;
    int refPos;
    int readPos;
    uint32_t *cigar = bam_get_cigar(b);
    for (cigInd = readPos = 0, refPos = b->core.pos; cigInd < b->core.n_cigar; ++cigInd) {
      int k;
      int lenCigBlock = cigar[cigInd]>>4;
      int op          = cigar[cigInd]&0xf;

      if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
        for (k = 0; k < lenCigBlock; ++k) {
          long covPos = (long)refPos + k;
          if (covPos < 0 || covPos >= sliceLength) break; // out of boundary
          coverage[covPos].coverage++;
        }
        if (k < lenCigBlock) break; // ran off the slice: nothing more to count for this read
        refPos += lenCigBlock;
        readPos += lenCigBlock;
      } else if (op == BAM_CDEL) {
        for (k = 0; k < lenCigBlock; ++k) {
          long covPos = (long)refPos + k;
          if (covPos < 0 || covPos >= sliceLength) break; // out of boundary
          coverage[covPos].coverage++;
        }
        if (k < lenCigBlock) break;
        refPos += lenCigBlock;
      } else if (op == BAM_CSOFT_CLIP) {
        readPos += lenCigBlock;
      } else if (op == BAM_CHARD_CLIP) {
        // consumes neither read nor reference
      } else if (op == BAM_CINS) {
        readPos += lenCigBlock;
      } else if (op == BAM_CREF_SKIP) {
        refPos += lenCigBlock;
      }
    }

#ifdef DONE
    int j;
    int done = 0;
    int hadOverlap = 0;

    for (j=startIndex; j < Vector_getNumElement(genes) && !done; j++) {
      Gene *gene = Vector_getElementAt(genes,j);
      if (!gene) {
        continue;
      }
      // Remember: b->core.pos is zero based!
      if (b->core.pos < Gene_getEnd(gene) && end >= Gene_getStart(gene)) {
        int k;

        int doneGene = 0;
        for (k=0; k<Gene_getTranscriptCount(gene) && !doneGene; k++) {
          Transcript *trans = Gene_getTranscriptAt(gene,k);

          if (b->core.pos < Transcript_getEnd(trans) && end >= Transcript_getStart(trans)) {
            int m;

            for (m=0; m<Transcript_getExonCount(trans) && !doneGene; m++) {
              Exon *exon = Transcript_getExonAt(trans,m);

              if (b->core.pos < Exon_getEnd(exon) && end >= Exon_getStart(exon)) {

                // Only count as overlapping once (could be that a read overlaps more than one gene)
                if (!hadOverlap) {
                  overlapping++;
                  hadOverlap = 1;
                }

                gs = IDHash_getValue(geneCountsHash, Gene_getDbID(gene));
                gs->score++;

                doneGene = 1;
              }
            }
          }
        }
      } else if (Gene_getStart(gene) > end) {
        done = 1;
      } else if (Gene_getEnd(gene) < b->core.pos+1) {
        gs = IDHash_getValue(geneCountsHash, Gene_getDbID(gene));
        printf("Gene %s (%s) score %ld\n",Gene_getStableId(gene),
                                          Gene_getDisplayXref(gene) ? DBEntry_getDisplayId(Gene_getDisplayXref(gene)) : "",
                                          gs->score);

        if (verbosity > 1) {
          printf("Removing gene %s (index %d) with extent %d to %d\n",
                 Gene_getStableId(gene),
                 gs->index,
                 Gene_getStart(gene),
                 Gene_getEnd(gene));
        }

        Vector_setElementAt(genes,j,NULL);

        // Magic (very important for speed) - move startIndex to first non null gene
        int n;
        startIndex = 0;
        for (n=0;n<Vector_getNumElement(genes);n++) {
          void *v = Vector_getElementAt(genes,n);

          if (v != NULL) {
            break;
          }
          startIndex++;
        }
        if (verbosity > 1) {
          printf("startIndex now %d\n",startIndex);
        }
      }
    }
#endif
  }
  if (verbosity > 1) { printf("\n"); }

#ifdef DONE
  // Print out read counts for what ever's left in the genes array
  int n;
  for (n=0;n<Vector_getNumElement(genes);n++) {
    Gene *gene = Vector_getElementAt(genes,n);

    if (gene != NULL) {
      gs = IDHash_getValue(geneCountsHash, Gene_getDbID(gene));
      printf("Gene %s (%s) score %ld\n",Gene_getStableId(gene),
                                        Gene_getDisplayXref(gene) ? DBEntry_getDisplayId(Gene_getDisplayXref(gene)) : "",
                                        gs->score);
    }
  }
#endif

  printf("Read %ld reads. Number of bad reads (unmapped, qc fail, secondary, dup) %ld\n", counter, bad);

  long i;
  for (i=0; i< sliceLength; i++) {
    printf("%ld %ld\n", i+1, coverage[i].coverage);
  }

  free(coverage);
  sam_itr_destroy(iter);
  bam_destroy1(b);

  return 1;
}