/**
 * Fetches the next alignment record into `s`, hiding from the caller whether
 * records come from region iterators or from a plain sequential scan.
 *
 * With random access enabled, records are pulled from the current iterator;
 * whenever it yields nothing, initialize_next_interval() is asked to set up
 * another one and the read is retried. Without random access the record is
 * read straight from the underlying BGZF stream.
 *
 * @param s  record to fill
 * @return true if a record was read; false if the sequential read failed or
 *         initialize_next_interval() reported that nothing is left
 */
bool BAMOrderedReader::read(bam1_t *s)
{
    if (!random_access_enabled)
    {
        // todo: filter via interval tree
        // if found in tree, return true else false
        return bam_read1(sam->fp.bgzf, s) >= 0;
    }

    for (;;)
    {
        if (itr && bam_itr_next(sam, itr, s) >= 0)
        {
            return true;
        }
        if (!initialize_next_interval())
        {
            return false;
        }
    }
}
// This function reads a BAM alignment from one BAM file. static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup { aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure int ret = aux->iter? bam_iter_read(aux->fp, aux->iter, b) : bam_read1(aux->fp, b); if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP; return ret; }
// currently, this function ONLY works if each read has one hit void bam_mating_core(bamFile in, bamFile out) { bam_header_t *header; bam1_t *b[2]; int curr, has_prev, pre_end = 0, cur_end; kstring_t str; str.l = str.m = 0; str.s = 0; header = bam_header_read(in); bam_header_write(out, header); b[0] = bam_init1(); b[1] = bam_init1(); curr = 0; has_prev = 0; while (bam_read1(in, b[curr]) >= 0) { bam1_t *cur = b[curr], *pre = b[1-curr]; if (cur->core.tid < 0) continue; cur_end = bam_calend(&cur->core, bam1_cigar(cur)); if (cur_end > (int)header->target_len[cur->core.tid]) cur->core.flag |= BAM_FUNMAP; if (cur->core.flag & BAM_FSECONDARY) continue; // skip secondary alignments if (has_prev) { if (strcmp(bam1_qname(cur), bam1_qname(pre)) == 0) { // identical pair name cur->core.mtid = pre->core.tid; cur->core.mpos = pre->core.pos; pre->core.mtid = cur->core.tid; pre->core.mpos = cur->core.pos; if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)) && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // set TLEN/ISIZE { uint32_t cur5, pre5; cur5 = (cur->core.flag&BAM_FREVERSE)? cur_end : cur->core.pos; pre5 = (pre->core.flag&BAM_FREVERSE)? 
pre_end : pre->core.pos; cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5; } else cur->core.isize = pre->core.isize = 0; if (pre->core.flag&BAM_FREVERSE) cur->core.flag |= BAM_FMREVERSE; else cur->core.flag &= ~BAM_FMREVERSE; if (cur->core.flag&BAM_FREVERSE) pre->core.flag |= BAM_FMREVERSE; else pre->core.flag &= ~BAM_FMREVERSE; if (cur->core.flag & BAM_FUNMAP) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FPROPER_PAIR; } if (pre->core.flag & BAM_FUNMAP) { cur->core.flag |= BAM_FMUNMAP; cur->core.flag &= ~BAM_FPROPER_PAIR; } bam_template_cigar(pre, cur, &str); bam_write1(out, pre); bam_write1(out, cur); has_prev = 0; } else { // unpaired or singleton pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0; if (pre->core.flag & BAM_FPAIRED) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FMREVERSE & ~BAM_FPROPER_PAIR; } bam_write1(out, pre); } } else has_prev = 1; curr = 1 - curr; pre_end = cur_end; } if (has_prev) bam_write1(out, b[1-curr]); bam_header_destroy(header); bam_destroy1(b[0]); bam_destroy1(b[1]); free(str.s); }
void edwBamToWig(char *input, char *output) /* edwBamToWig - Convert a bam file to a wig file by measuring depth of coverage, optionally adjusting hit size to average for library.. */ { FILE *f = mustOpen(output, "w"); /* Open file and get header for it. */ samfile_t *sf = samopen(input, "rb", NULL); if (sf == NULL) errnoAbort("Couldn't open %s.\n", input); bam_header_t *head = sf->header; if (head == NULL) errAbort("Aborting ... Bad BAM header in file: %s", input); /* Scan through input populating genome range trees */ struct genomeRangeTree *grt = genomeRangeTreeNew(); bam1_t one = {}; for (;;) { /* Read next record. */ if (bam_read1(sf->x.bam, &one) < 0) break; if (one.core.tid >= 0 && one.core.n_cigar > 0) { char *chrom = head->target_name[one.core.tid]; int start = one.core.pos; int end = start + one.core.l_qseq; if (one.core.flag & BAM_FREVERSE) { start -= clPad; } else { end += clPad; } struct rbTree *rt = genomeRangeTreeFindOrAddRangeTree(grt,chrom); rangeTreeAddToCoverageDepth(rt, start, end); } } /* Convert genome range tree into output wig */ /* Get list of chromosomes. */ struct hashEl *hel, *helList = hashElListHash(grt->hash); for (hel = helList; hel != NULL; hel = hel->next) { char *chrom = hel->name; struct rbTree *rt = hel->val; struct range *range, *rangeList = rangeTreeList(rt); for (range = rangeList; range != NULL; range = range->next) { fprintf(f, "%s\t%d\t%d\t%d\n", chrom, range->start, range->end, ptToInt(range->val)); } } carefulClose(&f); }
boolean bamIsSortedByTarget(char *fileName, int maxToCheck) /* Return TRUE if bam is sorted by target for at least the first bits. */ { int leftToCheck = maxToCheck; struct hash *targetHash = hashNew(0); boolean result = TRUE; /* Open bam/sam file and set up basic I/O vars on it. */ samfile_t *sf = samopen(fileName, "rb", NULL); bam_header_t *bamHeader = sf->header; bam1_t one; ZeroVar(&one); int err; char lastTarget[PATH_LEN] = ""; int lastPos = 0; /* Loop through while still haven't hit our max and file still has data */ while ((err = bam_read1(sf->x.bam, &one)) >= 0) { if (--leftToCheck < 0) { break; } /* Get target, skipping read if it's not aligned well enough to have a target. */ int32_t tid = one.core.tid; if (tid < 0) continue; char *target = bamHeader->target_name[tid]; int pos = one.core.pos; /* If we are on same target then make sure we are in ascending order. */ if (sameString(target, lastTarget)) { if (pos < lastPos) { result = FALSE; break; } } else { /* If sorted should not go back to a new chromosome. Use hash to check this */ if (hashLookup(targetHash, target)) { result = FALSE; break; } hashAdd(targetHash, target, NULL); safef(lastTarget, sizeof(lastTarget), "%s", target); } lastPos = pos; } hashFree(&targetHash); return result; }
// This function reads a BAM alignment from one BAM file. static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup { aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure int ret = aux->iter? bam_iter_read(aux->fp, aux->iter, b) : bam_read1(aux->fp, b); if (!(b->core.flag&BAM_FUNMAP)) { if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP; else if (aux->min_len && bam_cigar2qlen(&b->core, bam1_cigar(b)) < aux->min_len) b->core.flag |= BAM_FUNMAP; } return ret; }
// Reads up to n_needed sequences from a BAM stream into a newly allocated
// bwa_seq_t array.
//
// bs:        input stream; bs->which selects which reads to keep
//            (bit 1: read1, bit 2: read2, bit 4: reads flagged as neither)
// n_needed:  capacity of the returned batch
// n:         receives the number of sequences actually loaded
// is_comp:   passed to seq_reverse() when building rseq
// trim_qual: when >= 1, each read is passed through bwa_trim_read()
//
// Returns the batch, or 0 when no sequence was loaded. Calls
// err_fatal_simple() if bam_read1() fails with a code other than -1.
static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int trim_qual)
{
	bwa_seq_t *batch, *p;
	int loaded = 0, len, i, status;
	long trimmed_bases = 0, total_bases = 0;
	bam1_t *rec = bam_init1();

	batch = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
	while ((status = bam_read1(bs->fp, rec)) >= 0) {
		uint8_t *bases, *quals;
		const int is_read1 = (rec->core.flag & BAM_FREAD1) != 0;
		const int is_read2 = (rec->core.flag & BAM_FREAD2) != 0;
		int wanted = 0;

		if ((bs->which & 1) && is_read1) wanted = 1;
		if ((bs->which & 2) && is_read2) wanted = 1;
		if ((bs->which & 4) && !is_read1 && !is_read2) wanted = 1;
		if (!wanted) continue;

		len = rec->core.l_qseq;
		p = &batch[loaded++];
		p->tid = -1; // no assigned to a thread
		p->qual = 0;
		p->full_len = p->clip_len = p->len = len;
		total_bases += p->full_len;

		bases = bam1_seq(rec);
		quals = bam1_qual(rec);
		p->seq = (ubyte_t*)calloc(p->len + 1, 1);
		p->qual = (ubyte_t*)calloc(p->len + 1, 1);
		for (i = 0; i != p->full_len; ++i) {
			int shifted = quals[i] + 33; // offset by 33, capped at 126 below
			p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(bases, i)];
			p->qual[i] = shifted < 126 ? shifted : 126;
		}

		if (bam1_strand(rec)) { // then reverse
			seq_reverse(p->len, p->seq, 1);
			seq_reverse(p->len, p->qual, 0);
		}
		if (trim_qual >= 1)
			trimmed_bases += bwa_trim_read(trim_qual, p);

		p->rseq = (ubyte_t*)calloc(p->full_len, 1);
		memcpy(p->rseq, p->seq, p->len);
		seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
		seq_reverse(p->len, p->rseq, is_comp);
		p->name = strdup((const char*)bam1_qname(rec));

		if (loaded == n_needed) break;
	}

	// -1 is the only negative code that is not treated as a read error
	if (status < 0 && status != -1)
		err_fatal_simple("Error reading bam file");

	*n = loaded;
	if (loaded && trim_qual >= 1)
		fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * trimmed_bases/total_bases);

	bam_destroy1(rec);
	if (loaded == 0) {
		free(batch);
		return 0;
	}
	return batch;
}
// Converts alignments from `in` into new CIGARs and positions and writes them
// to `out`.
//
// A record that starts at position 0 and whose query name equals the name of
// its reference is taken to be the reference sequence itself: it gets a single
// all-match CIGAR and is used to build `posmap`, which maps each position of
// that sequence to the count of non-zero entries of r.s before it.
// Every other record has its CIGAR rebuilt by comparing its own unpadded
// sequence against the stored reference, and its position remapped through
// `posmap`.
//
// NOTE(review): records are assumed to follow their reference record; a
// non-reference record seen first would read r.s/posmap before they are set —
// confirm with callers.
//
// Always returns 0.
int bam_pad2unpad(bamFile in, bamFile out)
{
	bam_header_t *h;
	bam1_t *b;
	kstring_t r, q;
	uint32_t *cigar2 = 0; // scratch CIGAR, reused for every record
	int n2 = 0, m2 = 0, *posmap = 0;

	h = bam_header_read(in);
	bam_header_write(out, h);
	b = bam_init1();
	r.l = r.m = q.l = q.m = 0;
	r.s = q.s = 0;

	while (bam_read1(in, b) >= 0) {
		uint32_t *cigar = bam1_cigar(b);
		n2 = 0;
		if (b->core.pos == 0 && b->core.tid >= 0
			&& strcmp(bam1_qname(b), h->target_name[b->core.tid]) == 0)
		{
			// reference record
			int i, k;
			unpad_seq(b, &r);
			write_cigar(cigar2, n2, m2, bam_cigar_gen(b->core.l_qseq, BAM_CMATCH));
			replace_cigar(b, n2, cigar2);
			posmap = realloc(posmap, r.m * sizeof(int));
			for (i = k = 0; i < r.l; ++i) {
				posmap[i] = k; // note that a read should NOT start at a padding
				if (r.s[i]) ++k;
			}
		} else {
			int i, k, op;
			unpad_seq(b, &q);
			// keep a leading soft clip as is
			if (bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP)
				write_cigar(cigar2, n2, m2, cigar[0]);
			// classify every column by whether read and reference have a base there
			for (i = 0, k = b->core.pos; i < q.l; ++i, ++k)
				q.s[i] = q.s[i]? (r.s[k]? BAM_CMATCH : BAM_CINS) : (r.s[k]? BAM_CDEL : BAM_CPAD);
			// run-length encode the per-column operations
			for (i = k = 1, op = q.s[0]; i < q.l; ++i) {
				if (op != q.s[i]) {
					write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
					op = q.s[i];
					k = 1;
				} else ++k;
			}
			write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
			// keep a trailing soft clip as is
			if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CSOFT_CLIP)
				write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]);
			// fold match-pad-match triples into the last match and blank the other two
			for (i = 2; i < n2; ++i)
				if (bam_cigar_op(cigar2[i]) == BAM_CMATCH
					&& bam_cigar_op(cigar2[i-1]) == BAM_CPAD
					&& bam_cigar_op(cigar2[i-2]) == BAM_CMATCH)
					cigar2[i] += cigar2[i-2], cigar2[i-2] = cigar2[i-1] = 0;
			// drop the blanked entries
			for (i = k = 0; i < n2; ++i)
				if (cigar2[i]) cigar2[k++] = cigar2[i];
			n2 = k;
			replace_cigar(b, n2, cigar2);
			b->core.pos = posmap[b->core.pos];
		}
		bam_write1(out, b);
	}

	free(r.s);
	free(q.s);
	free(posmap);
	free(cigar2); // previously leaked
	bam_destroy1(b);
	bam_header_destroy(h);
	return 0;
}
// Copies a BAM file while dropping selected records.
//
// Usage: <prog> <in.bam> <out.bam>, with the 0-based indices of the records to
// drop supplied on stdin. The indices are consumed one at a time while the
// records are scanned, so they must be in ascending order.
//
// Returns 0 on success and -1 when arguments are missing, a file cannot be
// opened, or no index can be read from stdin.
int main(int argc, char** argv)
{
	if (argc < 3) {
		printf("No input nor output files provided");
		return -1;
	}

	bamFile in = bam_open(argv[1], "r");
	if (in == NULL) {
		printf("opening input file failed");
		return -1;
	}

	bamFile out = bam_open(argv[2], "w");
	if (out == NULL) {
		printf("opening output file failed");
		bam_close(in);
		return -1;
	}

	bam_header_t* header = bam_header_read(in);
	if (bam_header_write(out, header) < 0) {
		printf("writing header failed");
	}

	// scanf returns EOF (non-zero) on end of input, so compare against the
	// number of expected conversions instead of testing for zero.
	long nextPrunedId = -1;
	if (scanf("%ld", &nextPrunedId) != 1) {
		printf("warning: no ids provided");
		bam_header_destroy(header);
		bam_close(in);
		bam_close(out);
		return -1;
	}

	bam1_t* b = bam_init1();
	long id = 0;
	while (bam_read1(in, b) >= 0) {
		if (nextPrunedId != id++) {
			// write BAM back
			bam_write1(out, b);
		} else if (scanf("%ld", &nextPrunedId) != 1) {
			// no further ids: keep every remaining record
			nextPrunedId = -1;
		}
	}

	// closing all resources
	bam_header_destroy(header);
	bam_close(in);
	bam_close(out);
	bam_destroy1(b);
	return 0;
}
// Obtains a fresh node from the stream's memory pool and reads the next
// alignment from the input BAM file into it.
//
// The node is stored in pBamInStream->pNewNode. If the pool cannot supply a
// node, TGM_ErrQuit() is called with an explanatory message.
//
// Returns the value returned by bam_read1().
static inline int TGM_BamInStreamLoadNext(TGM_BamInStream* pBamInStream)
{
    pBamInStream->pNewNode = TGM_BamNodeAlloc(pBamInStream->pMemPool);
    if (!pBamInStream->pNewNode)
    {
        TGM_ErrQuit("ERROR: Too many unpaired reads are stored in the memory. Please use smaller bin size or disable searching pair genomically.\n");
    }

    return bam_read1(pBamInStream->fpBamInput, &(pBamInStream->pNewNode->alignment));
}
void convert_bam_to_sam(char* bam_input, char* sam_input) { int read_bytes; bam1_t* bam_p = bam_init1(); char* bam_string; LOG_DEBUG("CONVERT-START: bam to sam\n"); //open BAM file for read if (time_flag) { start_timer(t1_convert); } bam_file_t* bam_file_p = bam_fopen_mode(bam_input, NULL, "r"); //open SAM file for write, SAM file is a text file!!! FILE* sam_fd = fopen(sam_input, "w"); if (sam_fd == NULL) { char log_message[200]; sprintf(log_message, "Error opening file '%.150s' in mode 'r' !!!!!\n", sam_input); LOG_FATAL(log_message); } //header for BAM file has been done in the opening bam_header_t* bam_header_p = bam_file_p->bam_header_p; //write header text to SAM file fprintf(sam_fd, "%s", bam_header_p->text); //write string alignments to SAM file while ((read_bytes = bam_read1(bam_file_p->bam_fd, bam_p)) > 0) { bam_string = bam_format1(bam_header_p, bam_p); fprintf(sam_fd, "%s\n", bam_string); free(bam_string); // it was allocated by the sam-tools, we must free it !! num_alignments++; } //close BAM and SAM files, free bam alignment and bam file object bam_fclose(bam_file_p); fclose(sam_fd); bam_destroy1(bam_p); if (time_flag) { stop_timer(t1_convert, t2_convert, convert_time); } //number_of_batchs = 1, convention value for statistics (not real batch) number_of_batchs = 1; LOG_DEBUG("CONVERT-START: bam to sam\n"); }
// Runs a pileup over every alignment of a BAM stream.
//
// fp:        BAM stream to read
// mask:      passed to bam_plbuf_set_mask()
// func:      callback handed to the pileup buffer
// func_data: user data handed to the pileup buffer
//
// Always returns 0.
int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data)
{
	bam1_t *rec = bam_init1();
	bam_plbuf_t *plbuf = bam_plbuf_init(func, func_data);

	bam_plbuf_set_mask(plbuf, mask);
	for (;;) {
		if (bam_read1(fp, rec) < 0) break;
		bam_plbuf_push(rec, plbuf);
	}
	// a null record is pushed once the input is exhausted
	bam_plbuf_push(0, plbuf);

	bam_plbuf_destroy(plbuf);
	bam_destroy1(rec);
	return 0;
}
// Accumulates flag statistics over every alignment of a BAM stream.
//
// Returns a zero-initialised bam_flagstat_t, allocated with calloc, updated by
// flagstat_loop() once per record. A warning is printed to pysamerr when
// reading stops with a code other than -1.
bam_flagstat_t *bam_flagstat_core(bamFile fp)
{
	bam1_t *rec = bam_init1();
	bam1_core_t *c = &rec->core;
	bam_flagstat_t *s = (bam_flagstat_t*)calloc(1, sizeof(bam_flagstat_t));
	int status;

	for (status = bam_read1(fp, rec); status >= 0; status = bam_read1(fp, rec))
		flagstat_loop(s, c);

	bam_destroy1(rec);
	if (status != -1)
		fprintf(pysamerr, "[bam_flagstat_core] Truncated file? Continue anyway.\n");
	return s;
}
// Read callback for mpileup: keeps reading until a record passes every filter
// or the input ends.
//
// data: opaque pointer that is really an mplp_aux_t
// b:    record to fill
//
// Returns the reader's return value for the accepted record, or the negative
// value that ended the input.
static int mplp_func(void *data, bam1_t *b)
{
	extern int bam_realn(bam1_t *b, const char *ref);
	extern int bam_prob_realn_core(bam1_t *b, const char *ref, int);
	extern int bam_cap_mapQ(bam1_t *b, char *ref, int thres);
	mplp_aux_t *ma = (mplp_aux_t*)data;
	int ret;

	for (;;) {
		int reject = 0, has_ref;

		if (ma->iter) ret = bam_iter_read(ma->fp, ma->iter, b);
		else ret = bam_read1(ma->fp, b);
		if (ret < 0) break;

		// exclude unmapped reads
		if (b->core.tid < 0 || (b->core.flag&BAM_FUNMAP)) continue;

		// test overlap with the BED regions, when given
		if (ma->conf->bed
			&& !bed_overlap(ma->conf->bed, ma->h->target_name[b->core.tid], b->core.pos, bam_calend(&b->core, bam1_cigar(b))))
			continue;

		// exclude read groups
		if (ma->conf->rghash) {
			uint8_t *rg = bam_aux_get(b, "RG");
			if (rg && bcf_str2id(ma->conf->rghash, (const char*)(rg+1)) >= 0) continue;
		}

		// Illumina 1.3+ qualities: shift down by 31, flooring at 0
		if (ma->conf->flag & MPLP_ILLUMINA13) {
			int i;
			uint8_t *qual = bam1_qual(b);
			for (i = 0; i < b->core.l_qseq; ++i)
				qual[i] = qual[i] > 31? qual[i] - 31 : 0;
		}

		// the loaded reference is only usable when it is the one this read maps to
		has_ref = (ma->ref && ma->ref_id == b->core.tid)? 1 : 0;
		if (has_ref && (ma->conf->flag&MPLP_REALN))
			bam_prob_realn_core(b, ma->ref, (ma->conf->flag & MPLP_EXT_BAQ)? 3 : 1);

		if (has_ref && ma->conf->capQ_thres > 10) {
			int q = bam_cap_mapQ(b, ma->ref, ma->conf->capQ_thres);
			if (q < 0) reject = 1;
			else if (b->core.qual > q) b->core.qual = q;
		} else if (b->core.qual < ma->conf->min_mq) {
			reject = 1;
		} else if ((ma->conf->flag&MPLP_NO_ORPHAN) && (b->core.flag&1) && !(b->core.flag&2)) {
			reject = 1;
		}

		if (!reject) break;
	}
	return ret;
}
// Reads one alignment from a SAM or BAM file handle.
//
// Returns -1 when fp is null or was not opened for reading; otherwise the
// return value of bam_read1() for BAM handles or sam_read1() for the rest.
int samread(samfile_t *fp, bam1_t *b)
{
	if (fp == 0) return -1;
	if ((fp->type & TYPE_READ) == 0) return -1; // not open for reading

	if (fp->type & TYPE_BAM) {
		return bam_read1(fp->x.bam, b);
	}
	return sam_read1(fp->x.tamr, fp->header, b);
}
void signalFromBAM(const string bamFileName, const string sigFileName, Parameters P) { bam1_t *bamA; bamA=bam_init1(); double nMult=0, nUniq=0; if (P.outWigFlags.norm==1) {//count reads in the BAM file BGZF *bamIn=bgzf_open(bamFileName.c_str(),"r"); bam_hdr_t *bamHeader=bam_hdr_read(bamIn); while ( true ) {//until the end of file int bamBytes1=bam_read1(bamIn, bamA); if (bamBytes1<0) break; //end of file if (bamA->core.tid<0) continue; //unmapped read // if ( !std::regex_match(chrName.at(bamA->core.tid),std::regex(P.outWigReferencesPrefix))) continue; //reference does not mathc required references if ( P.outWigReferencesPrefix!="-" && (P.outWigReferencesPrefix.compare(0,P.outWigReferencesPrefix.size(),bamHeader->target_name[bamA->core.tid],P.outWigReferencesPrefix.size())!=0) ) continue; //reference does not match required references uint8_t* aNHp=bam_aux_get(bamA,"NH"); if (aNHp!=NULL) { uint32_t aNH=bam_aux2i(aNHp); if (aNH==1) {//unique mappers ++nUniq; } else if (aNH>1) { nMult+=1.0/aNH; }; }; }; bgzf_close(bamIn); }; BGZF *bamIn=bgzf_open(bamFileName.c_str(),"r"); bam_hdr_t *bamHeader=bam_hdr_read(bamIn); int sigN=P.outWigFlags.strand ? 4 : 2; double *normFactor=new double[sigN]; ofstream **sigOutAll=new ofstream* [sigN]; string* sigOutFileName=new string[sigN]; sigOutFileName[0]=sigFileName+".Unique.str1.out"; sigOutFileName[1]=sigFileName+".UniqueMultiple.str1.out"; if (P.outWigFlags.strand) { sigOutFileName[2]=sigFileName+".Unique.str2.out"; sigOutFileName[3]=sigFileName+".UniqueMultiple.str2.out"; }; for (int ii=0; ii<sigN; ii++) { sigOutFileName[ii]+= (P.outWigFlags.format==0 ? 
".bg" : ".wig"); sigOutAll[ii]=new ofstream ( sigOutFileName[ii].c_str() ); }; if (P.outWigFlags.norm==0) {//raw counts normFactor[0]=1; normFactor[1]=1; } else if (P.outWigFlags.norm==1) {//normlaized normFactor[0]=1.0e6 / nUniq; normFactor[1]=1.0e6 / (nUniq+nMult); for (int is=0;is<sigN;is++) {//formatting double output *sigOutAll[is]<<setiosflags(ios::fixed) << setprecision(5); }; }; if (P.outWigFlags.strand) { normFactor[2]=normFactor[0]; normFactor[3]=normFactor[1]; }; int iChr=-999; double *sigAll=NULL; uint32_t chrLen=0; while ( true ) {//until the end of file int bamBytes1=bam_read1(bamIn, bamA); if (bamA->core.tid!=iChr || bamBytes1<0) { //output to file if (iChr!=-999) {//iChr=-999 marks chromosomes that are not output, including unmapped reads for (int is=0;is<sigN;is++) { if (P.outWigFlags.format==1) { *sigOutAll[is] <<"variableStep chrom="<<bamHeader->target_name[iChr] <<"\n"; }; double prevSig=0; for (uint32_t ig=0;ig<chrLen;ig++) { double newSig=sigAll[sigN*ig+is]; if (P.outWigFlags.format==0) {//bedGraph if (newSig!=prevSig) { if (prevSig!=0) {//finish previous record *sigOutAll[is] <<ig<<"\t"<<prevSig*normFactor[is] <<"\n"; //1-based end }; if (newSig!=0) { *sigOutAll[is] << bamHeader->target_name[iChr] <<"\t"<< ig <<"\t"; //0-based beginning }; prevSig=newSig; }; } else if (P.outWigFlags.format==1){//wiggle if (newSig!=0) { *sigOutAll[is] <<ig+1<<"\t"<<newSig*normFactor[is] <<"\n"; }; }; }; }; }; if (bamBytes1<0) {//no more reads break; }; iChr=bamA->core.tid; if ( iChr==-1 || (P.outWigReferencesPrefix!="-" && (P.outWigReferencesPrefix.compare(0,P.outWigReferencesPrefix.size(),bamHeader->target_name[bamA->core.tid],P.outWigReferencesPrefix.size())!=0) ) ) { iChr=-999; continue; //reference does not match required references }; chrLen=bamHeader->target_len[iChr]+1;//one extra base at the end which sohuld always be 0 delete [] sigAll; sigAll= new double[sigN*chrLen]; memset(sigAll, 0, sizeof(*sigAll)*sigN*chrLen); }; // uint32_t nCigar 
=(bamA->core.flag<<16)>>16; // uint32_t mapFlag=bamA->core.flag>>16; // uint32_t mapQ=(bamA->core.flag<<16)>>24; #define BAM_CIGAR_OperationShift 4 #define BAM_CIGAR_LengthBits 28 #define BAM_CIGAR_M 0 #define BAM_CIGAR_I 1 #define BAM_CIGAR_D 2 #define BAM_CIGAR_N 3 #define BAM_CIGAR_S 4 #define BAM_CIGAR_H 5 #define BAM_CIGAR_P 6 #define BAM_CIGAR_EQ 7 #define BAM_CIGAR_X 8 //by default, alignments marked as duplicate are not processed if ( (bamA->core.flag & 0x400) > 0 ) continue; //NH attribute uint8_t* aNHp=bam_aux_get(bamA,"NH"); uint32_t aNH; if (aNHp==NULL) { aNH=1; //no NH tag: assume NH=1 //continue; //do not process lines without NH field } else { aNH=bam_aux2i(bam_aux_get(bamA,"NH")); //write a safer function allowing for lacking NH tag }; if (aNH==0) continue; //do not process lines without NH=0 uint32_t aG=bamA->core.pos; uint32_t iStrand=0; if (P.outWigFlags.strand) {//strand for stranded data from SAM flag iStrand= ( (bamA->core.flag & 0x10) > 0 ) == ( (bamA->core.flag & 0x80) == 0 );//0/1 for +/- }; if (P.outWigFlags.type==1) {//5' of the1st read signal only, RAMPAGE/CAGE if ( (bamA->core.flag & 0x80)>0) continue; //skip if this the second mate if (iStrand==0) { if (aNH==1) {//unique mappers sigAll[aG*sigN+0+2*iStrand]++; }; sigAll[aG*sigN+1+2*iStrand]+=1.0/aNH;//U+M, normalized by the number of multi-mapping loci continue; //record only the first position }; }; uint32_t* cigar=(uint32_t*) (bamA->data+bamA->core.l_qname); for (uint32_t ic=0; ic<bamA->core.n_cigar; ic++) { uint32_t cigOp=(cigar[ic]<<BAM_CIGAR_LengthBits)>>BAM_CIGAR_LengthBits; uint32_t cigL=cigar[ic]>>BAM_CIGAR_OperationShift; switch (cigOp) { case(BAM_CIGAR_D): case(BAM_CIGAR_N): aG+=cigL; break; case(BAM_CIGAR_M): if (P.outWigFlags.type==0 || (P.outWigFlags.type==2 && (bamA->core.flag & 0x80)>0 )) {//full signal, or second mate onyl signal for (uint32_t ig=0;ig<cigL;ig++) { if (aG>=chrLen) { cerr << "BUG: alignment extends past chromosome in signalFromBAM.cpp\n"; exit(-1); }; if 
(aNH==1) {//unique mappers sigAll[aG*sigN+0+2*iStrand]++; }; sigAll[aG*sigN+1+2*iStrand]+=1.0/aNH;//U+M, normalized by the number of multi-mapping loci aG++; }; } else { aG+=cigL; }; }; }; if (P.outWigFlags.type==1) {//full signal --aG; if (aNH==1) {//unique mappers sigAll[aG*sigN+0+2*iStrand]++; }; sigAll[aG*sigN+1+2*iStrand]+=1.0/aNH;//U+M, normalized by the number of multi-mapping loci }; }; delete [] sigAll; for (int is=0; is<sigN; is++) {// flush/close all signal files sigOutAll[is]->flush(); sigOutAll[is]->close(); }; };
int main(int argc, char *argv[]) { bamFile in; sqlite3 * db; sqlite3_stmt * stmt; char * sErrMsg = NULL; char * tail = 0; int nRetCode; char sSQL [BUFFER_SIZE] = "\0"; char database[BUFFER_SIZE]; clock_t startClock,startClock2; if (argc != 2) { fprintf(stderr, "Usage: bamRindex <in.bam>\n"); return 1; } // Open file and exit if error //in = strcmp(argv[1], "-")? bam_open(argv[1], "rb") : bam_dopen(fileno(stdin), "rb"); //fprintf(stderr,"Options ok\n"); in = bam_open(argv[1], "rb"); if (in == 0 ) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]); return 1; } //fprintf(stderr,"BAM opened\n"); assert(strcpy(database,argv[1])!=NULL); assert(strcat(database,".ridx")!=NULL); remove(database); // *********** // Read header bam_header_t *header; header = bam_header_read(in); // sorted by name? // Should not rely on the value in SO bam1_t *aln=bam_init1(); unsigned long num_alns=0; /*********************************************/ /* Open the Database and create the Schema */ // TODO: check the errors sqlite3_open(database, &db); sqlite3_exec(db, TABLE, NULL, NULL, &sErrMsg); // create the table SQLITE_CHECK_ERROR(); startClock = clock(); sqlite3_exec(db, "PRAGMA synchronous = 0;", NULL, NULL, &sErrMsg); SQLITE_CHECK_ERROR(); sqlite3_exec(db, "PRAGMA journal_mode = OFF;", NULL, NULL, &sErrMsg); SQLITE_CHECK_ERROR(); // Use up to 8GB of memory sqlite3_exec(db, "PRAGMA cache_size = -8000000;", NULL, NULL, &sErrMsg); SQLITE_CHECK_ERROR(); sqlite3_exec(db, "BEGIN TRANSACTION;", NULL, NULL, &sErrMsg); SQLITE_CHECK_ERROR(); while(bam_read1(in,aln)>=0) { // read alignment //aln->core.tid < 0 ? uint8_t *nh = bam_aux_get(aln, "NH"); uint8_t *nm = bam_aux_get(aln, "NM"); uint8_t *xs = bam_aux_get(aln, "XS"); BOOLEAN isPrimary; BOOLEAN isMapped; BOOLEAN notMapped; BOOLEAN isDuplicate; BOOLEAN isNotPassingQualityControls; BOOLEAN isPaired; BOOLEAN isSecondMateRead,isProperPair; //secondary alignment notMapped=(aln->core.flag & BAM_FUNMAP) ? 
TRUE: FALSE; //notMapped=((aln->core.flag & BAM_FUNMAP) || (aln->core.mtid ==0)) ? TRUE: FALSE; isMapped=!notMapped; isPrimary= (aln->core.flag & BAM_FSECONDARY) ? FALSE:TRUE; isProperPair=(aln->core.flag & BAM_FPROPER_PAIR) ? TRUE:FALSE; isPaired=(aln->core.flag & BAM_FPAIRED ) ? TRUE:FALSE; isSecondMateRead=(aln->core.flag & BAM_FREAD2 ) ? TRUE: FALSE; isNotPassingQualityControls=(aln->core.flag & BAM_FQCFAIL ) ? TRUE:FALSE; isDuplicate=(aln->core.flag & BAM_FDUP) ? TRUE: FALSE; BOOLEAN isSpliced=FALSE; BOOLEAN hasSimpleCigar=TRUE; int nSpliced=0; int i; if (aln->core.n_cigar != 0) { for (i = 0; i < aln->core.n_cigar; ++i) { char l="MIDNSHP=X"[bam1_cigar(aln)[i]&BAM_CIGAR_MASK]; //fprintf(stderr,"%c",l); if ( l == 'N' ) { isSpliced=TRUE; hasSimpleCigar=FALSE;++nSpliced;} if ( l != 'M' && l!='=' ) { hasSimpleCigar=FALSE;} } } //fprintf(stderr,"read %ld\n",num_alns); // isDuplicate,isNotPassingQualityControls, // isSpliced,isPAired,isPrimary,hasSimpleCigar,isSecondMateRead,isProperPair,nh,nm,qual/mapq,xs sprintf(sSQL,"INSERT into bam_index values (%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,'%c')", isDuplicate,isNotPassingQualityControls, nSpliced,isPaired,isPrimary,isMapped,hasSimpleCigar,isSecondMateRead,isProperPair, (nh==0?0:bam_aux2i(nh)),(nm==0?0:bam_aux2i(nm)), aln->core.qual, (xs==0?' ':(bam_aux2A(xs)==0?' 
':bam_aux2A(xs)))); sqlite3_exec(db, sSQL, NULL, NULL, &sErrMsg); SQLITE_CHECK_ERROR(); ++num_alns; PRINT_ALNS_PROCESSED(num_alns); } bam_close(in); sqlite3_exec(db, "END TRANSACTION;", NULL, NULL, &sErrMsg); SQLITE_CHECK_ERROR(); printf("\nImported %d records in %4.2f seconds\n", num_alns, ( (double) (clock() - startClock))/CLOCKS_PER_SEC); // Create the indexes startClock2 = clock(); // generating the indexes does not pay off //sqlite3_exec(db, INDEXES, NULL, NULL, &sErrMsg); //printf("Indexed %d records in %4.2f seconds\n", num_alns, ( (double) (clock() - startClock2))/CLOCKS_PER_SEC); printf("Total time: %4.2f seconds\n", ((double)(clock() - startClock))/CLOCKS_PER_SEC); sqlite3_close(db); return 0; }
// FIX MRNM and unaligned reads int main(int argc, char *argv[]) { bamFile in; long num_unmapped=0; long num_alns_pe=0; if (argc != 3) { fprintf(stderr, "Usage: bam_tophat2_pe_fix <in.bam> <out.bam>\n"); return 1; } in = strcmp(argv[1], "-")? bam_open(argv[1], "rb") : bam_dopen(fileno(stdin), "rb"); if (in == 0) { fprintf(stderr, "ERROR: Fail to open input BAM file %s\n", argv[1]); return 1; } int ref; unsigned long num_alns=0; // counts unsigned long unalign_mapq_fix=0; unsigned long mtid_fix=0; unsigned long mpos_fix=0; bamFile out; bam_header_t *header; header = bam_header_read(in); bam1_t *aln=bam_init1(); out = strcmp(argv[2], "-")? bam_open(argv[2], "w") : bam_dopen(fileno(stdout), "w"); if (out == 0) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[2]); return 1; } bam_header_write(out,header); while(bam_read1(in,aln)>=0) { ++num_alns; if (aln->core.tid < 0) { // unaligned reads if ( aln->core.qual!=0 ) { //fprintf(stderr, "ERROR: Unaligned read with quality > 0 in line %lu\n",num_alns); aln->core.qual=0; unalign_mapq_fix++; } } //fprintf(stderr,"%s %c %d %d\n",bam1_qname(aln),(aln->core.tid<0?'U':'M'),aln->core.mtid,aln->core.mpos); if ( aln->core.flag & BAM_FPAIRED ) { //fprintf(stderr,"paired %d\n",(aln->core.flag & BAM_FMUNMAP)); // paired if ( aln->core.mtid <0 && !(aln->core.flag & BAM_FMUNMAP) ) { aln->core.flag |= BAM_FMUNMAP; aln->core.mpos=-1; mtid_fix++; } if ( aln->core.mpos <0 && !(aln->core.flag & BAM_FMUNMAP) ) { aln->core.flag |= BAM_FMUNMAP; aln->core.mtid=-1; mpos_fix++; } } bam_write1(out,aln); } bam_destroy1(aln); bam_close(in); bam_close(out); // fprintf(stderr,"unaligned MAPQ fixes: %lu\n",unalign_mapq_fix); fprintf(stderr,"unaligned mtid fixes: %lu\n",mtid_fix); fprintf(stderr,"unaligned mpos fixes: %lu\n",mpos_fix); return 0; }
/**
 * PRIVATE. Wander for a region.
 *
 * Feeds reads to the context's wander function and appends them to `region`
 * until the region is full, the wander function reports a region change, an
 * error occurs, or the input ends.
 *
 * A read left over from a previous call (last_read) is consumed first;
 * otherwise reads come from the framework's input BAM file.
 *
 * \return WANDER_REGION_CHANGED when the region is full or the wander function
 *         reports a change (in the latter case the current read is kept in
 *         last_read for the next call); WANDER_READ_EOF when bam_read1
 *         returns -1; WANDER_READ_TRUNCATED for any other non-positive read
 *         result; otherwise the unexpected code returned by the wander function.
 */
static inline int bfwork_obtain_region(bam_fwork_t *fwork, bam_region_t *region)
{
	int err, bytes;
	bam1_t *read;
#ifdef D_TIME_DEBUG
	double times;
#endif

	//Get first read
	if(last_read != NULL)
	{
		read = last_read;
		bytes = last_read_bytes;
		last_read = NULL;
	}
	else
	{
		//Get first read from file
		read = bam_init1();
		assert(read);
		bytes = bam_read1(fwork->input_file->bam_fd, read);
	}

	//Iterate reads
	while(bytes > 0)
	{
		//Wander this read
		omp_set_lock(&region->lock);
#ifdef D_TIME_DEBUG
		times = omp_get_wtime();
#endif
		err = fwork->context->wander_f(fwork, region, read);
#ifdef D_TIME_DEBUG
		times = omp_get_wtime() - times;
		if(fwork->context->time_stats)
			time_add_time_slot(D_FWORK_WANDER_FUNC, fwork->context->time_stats, times);
#endif
		omp_unset_lock(&region->lock);

		switch(err)
		{
		case WANDER_READ_FILTERED:	//This read dont pass the filters; it is still added below
		case NO_ERROR:
			//Add read to region
			omp_set_lock(&region->lock);
			region->reads[region->size] = read;
			region->size++;

			//Region is full?
			if(region->size >= region->max_size)
			{
				omp_unset_lock(&region->lock);
				return WANDER_REGION_CHANGED;
			}
			omp_unset_lock(&region->lock);

			//Get next read from file
			read = bam_init1();
			assert(read);
			bytes = bam_read1(fwork->input_file->bam_fd, read);
			break;

		case WANDER_REGION_CHANGED:
			//The region have changed: keep this read for the next call
			last_read = read;
			last_read_bytes = bytes;
			return err;

		default:
			//Unknown error
			//NOTE(review): `read` is neither stored nor destroyed on this path - confirm ownership
			LOG_ERROR_F("Framework fails with error code: %d\n", err);
			return err;
		}
	}

	//The loop only exits once bytes <= 0: the last read holds no data
	bam_destroy1(read);

	//End of file or truncated input
	if(bytes == -1)
	{
		return WANDER_READ_EOF;
	}
	return WANDER_READ_TRUNCATED;
}
int main(int argc, char *argv[]) { short out2stdout=0; bamFile in,in2; bamFile out; if (argc != 3) { fprintf(stderr, "Usage: bam_fix_se_flag <in.bam> <out.bam or - for stdout>\n"); return 1; } // Open file and exit if error in = bam_open(argv[1], "rb"); out2stdout = strcmp(argv[2], "-")? 0 : 1; out = strcmp(argv[2], "-")? bam_open(argv[2], "w") : bam_dopen(fileno(stdout), "w"); if (in == 0 ) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]); return 1; } if (out == 0) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[2]); return 1; } unsigned long num_alns=0; int ref; // *********** // Copy header bam_header_t *header; header = bam_header_read(in); bam_header_write(out,header); // sorted by name? // Should not rely on the value in SO bam1_t *aln=bam_init1(); bam1_t *prev=bam_init1(); if (!out2stdout) { fprintf(stderr,"bam_fix_se_flag version %s\n",VERSION); fprintf(stderr,"Processing %s\n",argv[1]); } // reopen in2 = bam_open(argv[1], "rb"); if (in2 == 0 ) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]); return 1; } header = bam_header_read(in2); num_alns=0; while(bam_read1(in2,aln)>=0) { // read alignment if (aln->core.tid < 0) continue;//ignore unaligned reads if (aln->core.flag & BAM_FUNMAP) continue; if (aln->core.flag & BAM_FPAIRED ) { // PAIRED } else { //SE //turn off the other pair related flags aln->core.flag&=~BAM_FPROPER_PAIR; aln->core.flag&=~BAM_FMUNMAP; aln->core.flag&=~BAM_FREAD1; aln->core.flag&=~BAM_FREAD2; fprintf(stderr, "."); } bam_write1(out,aln); if(!out2stdout) PRINT_ALNS_PROCESSED(num_alns); ++num_alns; } // bam_destroy1(aln); bam_close(in2); bam_close(out); if(!out2stdout) { fprintf(stderr,"%s%lu\n",BACKLINE,num_alns); fprintf(stderr,"Done.\n"); } return 0; }
int main(int argc, char *argv[]) { short out2stdout=0; hashtable ht=new_hashtable(HASHSIZE); bamFile in,in2; bamFile out; int paired;//1 if not paired or pair read 1, 2 otherwise index_mem=sizeof(hashtable)*sizeof(hashnode**)*HASHSIZE*2; if (argc != 3) { fprintf(stderr, "Usage: bam_fix_NH <in.bam> <out.bam or - for stdout>\n"); return 1; } // Open file and exit if error in = bam_open(argv[1], "rb"); out2stdout = strcmp(argv[2], "-")? 0 : 1; out = strcmp(argv[2], "-")? bam_open(argv[2], "w") : bam_dopen(fileno(stdout), "w"); if (in == 0 ) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]); return 1; } if (out == 0) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[2]); return 1; } unsigned long num_alns=0; int ref; // *********** // Copy header bam_header_t *header; header = bam_header_read(in); bam_header_write(out,header); // sorted by name? // Should not rely on the value in SO bam1_t *aln=bam_init1(); bam1_t *prev=bam_init1(); if (!out2stdout) { fprintf(stderr,"bam_fix_NH version %s\n",VERSION); fprintf(stderr,"Processing %s\n",argv[1]); fprintf(stderr,"Hashing...\n");fflush(stderr); } while(bam_read1(in,aln)>=0) { // read alignment if (aln->core.tid < 0) continue;//ignore unaligned reads if (aln->core.flag & BAM_FUNMAP) continue; if (aln->core.flag & BAM_FREAD2) paired=2; else paired=1; ++num_alns; new_read_aln(ht,fix_read_name(bam1_qname(aln),paired)); if(!out2stdout) PRINT_ALNS_PROCESSED(num_alns); } bam_close(in); if(!out2stdout) { fprintf(stderr,"%s%lu\n",BACKLINE,num_alns); fprintf(stderr,"Hashing complete (%lu alignments)\n",num_alns); fprintf(stderr,"Memory used: %ld MB\n",index_mem/1024/1024); fprintf(stderr,"Updating entries with NH and printing BAM...\n"); fflush(stderr); } // reopen in2 = bam_open(argv[1], "rb"); if (in2 == 0 ) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]); return 1; } header = bam_header_read(in2); num_alns=0; while(bam_read1(in2,aln)>=0) { // read alignment paired=1; if (aln->core.tid < 0) 
continue;//ignore unaligned reads if (aln->core.flag & BAM_FUNMAP) continue; if (aln->core.flag & BAM_FREAD2) paired=2; ++num_alns; READ_ALN *r=get_read_aln(ht,fix_read_name(bam1_qname(aln),paired)); assert(r!=NULL); // update the NH field uint8_t *old_nh = bam_aux_get(aln, "NH"); int32_t nh=r->ctr; if (old_nh) { if (nh!=bam_aux2i(old_nh)) { fprintf(stderr,"warning: value mismatch! replacing>%s %d->%d\n",bam1_qname(aln),bam_aux2i(old_nh),nh); } bam_aux_del(aln, old_nh); bam_aux_append(aln, "NH", 'i', 4, (uint8_t*)&nh); #ifdef DEBUG // printf("!>%s %d\n",bam1_qname(aln),r->ctr); #endif } if (!old_nh) { // add NH bam_aux_append(aln, "NH", 'i', 4, (uint8_t*)&nh); #ifdef DEBUG fprintf(stderr,"!>%s %d\n",bam1_qname(aln),bam_aux2i(old_nh)); #endif } bam_write1(out,aln); if(!out2stdout) PRINT_ALNS_PROCESSED(num_alns); } // bam_destroy1(aln); bam_close(in2); bam_close(out); if(!out2stdout) { fprintf(stderr,"%s%lu\n",BACKLINE,num_alns); fprintf(stderr,"Done.\n"); } return 0; }
int main(int argc, char *argv[]) { hashtable ht=new_hashtable(HASHSIZE); bamFile in,in2; bamFile out; if (argc != 3) { fprintf(stderr, "Usage: bam_fix_NH <in.bam> <out.bam>\n"); return 1; } // Open file and exit if error //in = strcmp(argv[1], "-")? bam_open(argv[1], "rb") : bam_dopen(fileno(stdin), "rb"); in = bam_open(argv[1], "rb"); out = strcmp(argv[2], "-")? bam_open(argv[2], "w") : bam_dopen(fileno(stdout), "w"); if (in == 0 ) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]); return 1; } if (out == 0) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[2]); return 1; } unsigned long num_alns=0; int ref; // *********** // Copy header bam_header_t *header; header = bam_header_read(in); bam_header_write(out,header); // sorted by name? // Should not rely on the value in SO bam1_t *aln=bam_init1(); bam1_t *prev=bam_init1(); printf("Hashing...\n");flush(stdout); while(bam_read1(in,aln)>=0) { // read alignment if (aln->core.tid < 0) continue;//ignore unaligned reads ++num_alns; new_read_aln(ht,bam1_qname(aln)); } bam_close(in); printf("Hashing complete (%lu alignments)\n",num_alns); printf("Memory used in the hash: %ld MB\n",index_mem/1024/1024); flush(stdout); // reopen in2 = bam_open(argv[1], "rb"); if (in2 == 0 ) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]); return 1; } header = bam_header_read(in2); while(bam_read1(in2,aln)>=0) { // read alignment if (aln->core.tid < 0) continue;//ignore unaligned reads ++num_alns; READ_ALN *r=get_read_aln(ht,bam1_qname(aln)); //assert(r!=NULL); // update the NH field uint8_t *old_nh = bam_aux_get(aln, "NH"); uint8_t nh=r->ctr; if (old_nh) { if (nh!=bam_aux2i(old_nh)) { fprintf(stderr,"warning: value mismatch! 
replacing>%s %d->%d\n",bam1_qname(aln),bam_aux2i(old_nh),nh); } bam_aux_del(aln, old_nh); bam_aux_append(aln, "NH", 'i', 4, (uint8_t*)&nh); } if (!old_nh) { // add NH bam_aux_append(aln, "NH", 'i', 4, (uint8_t*)&nh); #ifdef DEBUG printf("!>%s %d\n",bam1_qname(aln),bam_aux2i(old_nh)); #endif } // in->header // Also fix the XS:A tag // BAM_FREAD1 // BAM_FREAD2 // BAM_FREVERSE the read is mapped to the reverse strand //bam1_cigar(b) //BAM_CREF_SKIP 3 CIGAR skip on the reference (e.g. spliced alignment) //BAM_FREVERSE 16 the read is mapped to the reverse strand if (aln->core.flag & BAM_FSECONDARY) continue; // skip secondary alignments if (aln->core.flag & ! BAM_FPAIRED) continue; // not paired if (aln->core.flag & ! BAM_FPROPER_PAIR) continue; // not a proper pair if (aln->core.flag & ! BAM_FMUNMAP) continue; // the mate is mapped if (aln->core.flag & BAM_FSECONDARY) continue; // secundary read if (aln->core.flag & BAM_FREAD2) continue; // only count each pair once // core.strand == 0 (f/+) 1 r/- // flag // bam1_qname(b) bam_write1(out,aln); } // bam_destroy1(aln); bam_close(in2); bam_close(out); return 0; /* uint8_t *old_nm = bam_aux_get(b, "NM"); 90 if (c->flag & BAM_FUNMAP) return; 91 if (old_nm) old_nm_i = bam_aux2i(old_nm); 92 if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); 93 else if (nm != old_nm_i) { 94 fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm); 95 bam_aux_del(b, old_nm); 96 bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); 97 } */ }
/* Read one pair from a bam file.  Returns 1 if we got a singleton, 2 if
 * we got a pair, 0 if we reached EOF, -1 if something outside our
 * control went wrong, -2 if we got something unexpected (missing mate,
 * fragment with unexpected PE flags).
 *
 * `pair` is zeroed on entry.  A record flagged as paired must be followed
 * by a record with the same name; when it is not, the first record is
 * discarded (its SAI entry is consumed through try_get_sai() and released)
 * and, if `allow_broken` is set, reading continues with the second record
 * as the new candidate.  With `allow_broken` set, mates whose READ1/READ2
 * flags are inconsistent are repaired instead of being rejected, and a
 * paired record followed by EOF is reported as EOF (0) instead of -2.
 */
static int read_bam_pair_core(bwa_seqio_t *bs, bam_pair_t *pair, int allow_broken)
{
    static int num_wrong_pair = 128 ; /* remaining "names don't match" reports */
    const uint32_t pe_mask = BAM_FPAIRED|BAM_FREAD1|BAM_FREAD2 ;

    memset(pair, 0, sizeof(bam_pair_t)) ;
    if (bam_read1(bs->fp, &pair->bam_rec[0]) < 0) return 0 ;

    while(1) {
        uint32_t flag1, flag2 ;

        if (!(pair->bam_rec[0].core.flag & BAM_FPAIRED)) {
            // singleton read
            pair->kind = singleton ;
            return 1 ;
        }

        // paired read, get another
        if (bam_read1(bs->fp, &pair->bam_rec[1]) < 0) {
            fprintf( stderr, "[read_bam_pair] got a paired read and hit EOF.\n" ) ;
            free(pair->bam_rec[0].data);
            if(pair->bwa_seq[0].n_aln) {
                free(pair->bwa_seq[0].aln);
                pair->bwa_seq[0].aln = 0 ;
                pair->bwa_seq[0].n_aln = 0 ;
            }
            return allow_broken ? 0 : -2 ;
        }

        flag1 = pair->bam_rec[0].core.flag & pe_mask ;
        flag2 = pair->bam_rec[1].core.flag & pe_mask ;

        if (!strcmp(bam1_qname(&pair->bam_rec[0]), bam1_qname(&pair->bam_rec[1]))) {
            // actual mates
            if( flag1 == (BAM_FPAIRED|BAM_FREAD1) && flag2 == (BAM_FPAIRED|BAM_FREAD2) ) {
                // correct order
                pair->kind = proper_pair ;
                return 2 ;
            }
            if (flag2 == (BAM_FPAIRED|BAM_FREAD1) && flag1 == (BAM_FPAIRED|BAM_FREAD2) ) {
                // reverse order
                memswap(&pair->bam_rec[0], &pair->bam_rec[1], sizeof(bam1_t));
                pair->kind = proper_pair ;
                return 2 ;
            }
            fprintf( stderr, "[read_bam_pair] got a pair, but the flags are wrong (%s).\n", bam1_qname(&pair->bam_rec[0]) ) ;
            if( !allow_broken ) return -2 ;
            // force the records into a first/second mate pair
            pair->bam_rec[0].core.flag &= ~BAM_FREAD2;
            pair->bam_rec[0].core.flag |= BAM_FPAIRED|BAM_FREAD1;
            pair->bam_rec[1].core.flag &= ~BAM_FREAD1;
            pair->bam_rec[1].core.flag |= BAM_FPAIRED|BAM_FREAD2;
            pair->kind = proper_pair ;
            return 2 ;
        }

        // This is arguably wrong, we discard a lone mate.  But what else could we do?  Buffering it
        // somewhere is too hard for the time being, returning it as a single means we need to buffer the
        // next one.  Not very appealing.  So only two options remain: discard it or bail out.
        if( num_wrong_pair ) {
            fprintf( stderr, "[read_bam_pair] got two reads, but the names don't match (%s,%s).\n",
                    bam1_qname(&pair->bam_rec[0]), bam1_qname(&pair->bam_rec[1]) ) ;
            --num_wrong_pair ;
            if( !num_wrong_pair )
                fprintf( stderr, "[read_bam_pair] too many mismatched names, not reporting anymore.\n" ) ;
        }

        // Consume the SAI entry of the discarded record, then drop everything
        // that belongs to it.
        try_get_sai( bs->sai, flag1 & BAM_FREAD1 ? 1 : 2, &pair->bwa_seq[0].n_aln, &pair->bwa_seq[0].aln ) ;
        free(pair->bam_rec[0].data);
        if(pair->bwa_seq[0].n_aln) free(pair->bwa_seq[0].aln);
        // Forget the released alignments so that no later path (the bail-out
        // below, or the EOF branch on a following iteration) frees them again.
        pair->bwa_seq[0].aln = 0 ;
        pair->bwa_seq[0].n_aln = 0 ;

        if( !allow_broken ) {
            free(pair->bam_rec[1].data);
            return -2 ;
        }

        // the second record becomes the new candidate for the first mate
        memmove(&pair->bam_rec[0], &pair->bam_rec[1], sizeof(bam1_t));
        memset(&pair->bam_rec[1], 0, sizeof(bam1_t));
    }
}
/*! @abstract Sort an unsorted BAM file based on the chromosome order and the leftmost position of an alignment @param is_by_qname whether to sort by query name @param fn name of the file to be sorted @param prefix prefix of the output and the temporary files; upon sucessess, prefix.bam will be written. @param max_mem approxiate maximum memory (very inaccurate) @discussion It may create multiple temporary subalignment files and then merge them by calling bam_merge_core(). This function is NOT thread safe. */ void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size_t max_mem, int is_stdout) { int n, ret, k, i; size_t mem; bam_header_t *header; bamFile fp; bam1_t *b, **buf; g_is_by_qname = is_by_qname; n = k = 0; mem = 0; fp = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); if (fp == 0) { fprintf(stderr, "[bam_sort_core] fail to open file %s\n", fn); return; } header = bam_header_read(fp); if (is_by_qname) change_SO(header, "queryname"); else change_SO(header, "coordinate"); buf = (bam1_t**)calloc(max_mem / BAM_CORE_SIZE, sizeof(bam1_t*)); // write sub files for (;;) { if (buf[k] == 0) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t)); b = buf[k]; if ((ret = bam_read1(fp, b)) < 0) break; mem += ret; ++k; if (mem >= max_mem) { sort_blocks(n++, k, buf, prefix, header, 0); mem = 0; k = 0; } } if (ret != -1) fprintf(stderr, "[bam_sort_core] truncated file. 
Continue anyway.\n"); if (n == 0) sort_blocks(-1, k, buf, prefix, header, is_stdout); else { // then merge char **fns, *fnout; fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n+1); sort_blocks(n++, k, buf, prefix, header, 0); fnout = (char*)calloc(strlen(prefix) + 20, 1); if (is_stdout) sprintf(fnout, "-"); else sprintf(fnout, "%s.bam", prefix); fns = (char**)calloc(n, sizeof(char*)); for (i = 0; i < n; ++i) { fns[i] = (char*)calloc(strlen(prefix) + 20, 1); sprintf(fns[i], "%s.%.4d.bam", prefix, i); } bam_merge_core(is_by_qname, fnout, 0, n, fns, 0, 0); free(fnout); for (i = 0; i < n; ++i) { unlink(fns[i]); free(fns[i]); } free(fns); } for (k = 0; (size_t)k < max_mem / BAM_CORE_SIZE; ++k) { if (buf[k]) { free(buf[k]->data); free(buf[k]); } } free(buf); bam_header_destroy(header); bam_close(fp); }
int main_bam2fq(int argc, char *argv[]) { BGZF *fp, *fpse = 0; bam1_t *b; uint8_t *buf; int max_buf, c, has12 = 0; kstring_t str; int64_t n_singletons = 0, n_reads = 0; char last[512], *fnse = 0; while ((c = getopt(argc, argv, "as:")) > 0) if (c == 'a') has12 = 1; else if (c == 's') fnse = optarg; if (argc == optind) { fprintf(stderr, "\nUsage: bam2fq [-a] [-s outSE] <in.bam>\n\n"); fprintf(stderr, "Options: -a append /1 and /2 to the read name\n"); fprintf(stderr, " -s FILE write singleton reads to FILE [assume single-end]\n"); fprintf(stderr, "\n"); return 1; } fp = strcmp(argv[optind], "-")? bgzf_open(argv[optind], "r") : bgzf_dopen(fileno(stdin), "r"); assert(fp); bam_hdr_destroy(bam_hdr_read(fp)); buf = 0; max_buf = 0; str.l = str.m = 0; str.s = 0; last[0] = 0; if (fnse) fpse = bgzf_open(fnse, "w1"); b = bam_init1(); while (bam_read1(fp, b) >= 0) { int i, qlen = b->core.l_qseq, is_print = 0; uint8_t *qual, *seq; if (b->flag&BAM_FSECONDARY) continue; // skip secondary alignments ++n_reads; if (fpse) { if (str.l && strcmp(last, bam_get_qname(b))) { bgzf_write(fpse, str.s, str.l); str.l = 0; ++n_singletons; } if (str.l) is_print = 1; strcpy(last, bam_get_qname(b)); } else is_print = 1; qual = bam_get_qual(b); kputc(qual[0] == 0xff? 
'>' : '@', &str); kputsn(bam_get_qname(b), b->core.l_qname - 1, &str); if (has12) { kputc('/', &str); kputw(b->core.flag>>6&3, &str); } kputc('\n', &str); if (max_buf < qlen + 1) { max_buf = qlen + 1; kroundup32(max_buf); buf = (uint8_t*)realloc(buf, max_buf); } buf[qlen] = 0; seq = bam_get_seq(b); for (i = 0; i < qlen; ++i) buf[i] = bam_seqi(seq, i); // copy the sequence if (bam_is_rev(b)) { // reverse complement for (i = 0; i < qlen>>1; ++i) { int8_t t = seq_comp_table[buf[qlen - 1 - i]]; buf[qlen - 1 - i] = seq_comp_table[buf[i]]; buf[i] = t; } if (qlen&1) buf[i] = seq_comp_table[buf[i]]; } for (i = 0; i < qlen; ++i) buf[i] = seq_nt16_str[buf[i]]; kputsn((char*)buf, qlen, &str); kputc('\n', &str); if (qual[0] != 0xff) { kputsn("+\n", 2, &str); for (i = 0; i < qlen; ++i) buf[i] = 33 + qual[i]; if (bam_is_rev(b)) { // reverse for (i = 0; i < qlen>>1; ++i) { uint8_t t = buf[qlen - 1 - i]; buf[qlen - 1 - i] = buf[i]; buf[i] = t; } } } kputsn((char*)buf, qlen, &str); kputc('\n', &str); if (is_print) { fwrite(str.s, 1, str.l, stdout); str.l = 0; } } if (fpse) { if (str.l) { bgzf_write(fpse, str.s, str.l); ++n_singletons; } fprintf(stderr, "[M::%s] discarded %lld singletons\n", __func__, (long long)n_singletons); bgzf_close(fpse); } fprintf(stderr, "[M::%s] processed %lld reads\n", __func__, (long long)n_reads); free(buf); free(str.s); bam_destroy1(b); bgzf_close(fp); return 0; }
/**
 * Read up to n_needed reads from a BAM input.
 *
 * A record is used when it matches `bs->which`: bit 1 selects records
 * flagged READ1, bit 2 records flagged READ2 and bit 4 records with neither
 * flag. Bases are converted with bam_nt16_nt4_table, qualities are stored
 * as Phred+33 capped at 126, and records on the reverse strand are turned
 * back to the original read orientation.
 *
 * @param bs        input; must refer to a BAM stream
 * @param n_needed  maximum number of reads to return
 * @param n         receives the number of reads actually returned
 * @param is_comp   passed to seq_reverse() when building `rseq`
 * @param trim_qual when >= 1, reads are trimmed with bwa_trim_read()
 * @return array of *n reads (release with bwa_free_read_seq()), or 0 when
 *         no read was obtained. A read error other than EOF (-1) is fatal.
 */
static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int trim_qual)
{
	bwa_seq_t *seqs, *p;
	int n_seqs, l, i;
	long n_trimmed = 0, n_tot = 0;
	bam1_t *b;
	int res;

	b = bam_init1();
	n_seqs = 0;
	seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
	for (;;) {
		uint8_t *s, *q;
		int go = 0;
#ifdef USE_HTSLIB
		res = sam_read1(bs->fp, bs->h, b);
#else
		res = bam_read1(bs->fp, b);
#endif
		if (res < 0) break;

		if ((bs->which & 1) && (b->core.flag & BAM_FREAD1)) go = 1;
		if ((bs->which & 2) && (b->core.flag & BAM_FREAD2)) go = 1;
		if ((bs->which & 4) && !(b->core.flag& BAM_FREAD1) && !(b->core.flag& BAM_FREAD2))go = 1;
		if (go == 0) continue;

		l = b->core.l_qseq;
		p = &seqs[n_seqs++];
		p->tid = -1; // no assigned to a thread
		p->qual = 0;
		p->full_len = p->clip_len = p->len = l;
		n_tot += p->full_len;
#ifdef USE_HTSLIB
		s = bam_get_seq(b); q = bam_get_qual(b);
#else
		s = bam1_seq(b); q = bam1_qual(b);
#endif
		p->seq = (ubyte_t*)calloc(p->len + 1, 1);
		p->qual = (ubyte_t*)calloc(p->len + 1, 1);
		for (i = 0; i != p->full_len; ++i) {
#ifdef USE_HTSLIB
			p->seq[i] = bam_nt16_nt4_table[(int)bam_seqi(s, i)];
#else
			p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(s, i)];
#endif
			p->qual[i] = q[i] + 33 < 126? q[i] + 33 : 126;
		}
#ifdef USE_HTSLIB
		if (bam_is_rev(b)) // then reverse
#else
		if (bam1_strand(b)) // then reverse
#endif
		{
			seq_reverse(p->len, p->seq, 1);
			seq_reverse(p->len, p->qual, 0);
		}
		if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
		p->rseq = (ubyte_t*)calloc(p->full_len, 1);
		memcpy(p->rseq, p->seq, p->len);
		seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
		seq_reverse(p->len, p->rseq, is_comp);
#ifdef USE_HTSLIB
		p->name = strdup((const char*)bam_get_qname(b));
#else
		p->name = strdup((const char*)bam1_qname(b));
#endif
		if (n_seqs == n_needed) break;
	}
	if (res < 0 && res != -1) err_fatal_simple("Error reading bam file");
	*n = n_seqs;
	if (n_seqs && trim_qual >= 1)
		fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);
	bam_destroy1(b);
	if (n_seqs == 0) {
		free(seqs);
		return 0;
	}
	return seqs;
}

/* A barcode base whose Phred quality is below this value is stored in lower case. */
#define BARCODE_LOW_QUAL 13

/**
 * Read up to n_needed reads from the input described by `bs`.
 *
 * BAM inputs are delegated to bwa_read_bam(). For FASTA/FASTQ inputs each
 * read is optionally filtered (BWA_MODE_CFY: comment with 'Y' after the
 * first ':'), has its qualities shifted by -31 (BWA_MODE_IL13), has a
 * leading barcode of `mode>>24` bases moved to `bc`, is converted with
 * nst_nt4_table, optionally quality-trimmed, and has a trailing "/1" or
 * "/2" removed from its name.
 *
 * @param bs        input
 * @param n_needed  maximum number of reads to return
 * @param n         receives the number of reads actually returned (it is not
 *                  written when the barcode length is rejected)
 * @param mode      BWA_MODE_* bits; the barcode length is stored in bits 24+
 * @param trim_qual when >= 1, reads with qualities are trimmed
 * @return array of *n reads (release with bwa_free_read_seq()), or 0 when no
 *         read was obtained or the barcode length exceeds BWA_MAX_BCLEN.
 */
bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int trim_qual)
{
	bwa_seq_t *seqs, *p;
	kseq_t *seq = bs->ks;
	int n_seqs, l, i, is_comp = mode&BWA_MODE_COMPREAD, is_64 = mode&BWA_MODE_IL13, l_bc = mode>>24;
	long n_trimmed = 0, n_tot = 0;

	if (l_bc > BWA_MAX_BCLEN) {
		fprintf(stderr, "[%s] the maximum barcode length is %d.\n", __func__, BWA_MAX_BCLEN);
		return 0;
	}
	if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); // l_bc has no effect for BAM input
	n_seqs = 0;
	seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
	while ((l = kseq_read(seq)) >= 0) {
		if ((mode & BWA_MODE_CFY) && (seq->comment.l != 0)) {
			// skip reads that are marked to be filtered by Casava
			char *s = strchr(seq->comment.s, ':');
			if (s && *(++s) == 'Y') {
				continue;
			}
		}
		if (is_64 && seq->qual.l)
			for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31;
		if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length
		p = &seqs[n_seqs++];
		if (l_bc) { // then trim barcode
			// <ctype.h> functions need an unsigned char value
			for (i = 0; i < l_bc; ++i)
				p->bc[i] = (seq->qual.l && seq->qual.s[i]-33 < BARCODE_LOW_QUAL)
					? tolower((unsigned char)seq->seq.s[i])
					: toupper((unsigned char)seq->seq.s[i]);
			p->bc[i] = 0;
			for (; i < seq->seq.l; ++i)
				seq->seq.s[i - l_bc] = seq->seq.s[i];
			seq->seq.l -= l_bc; seq->seq.s[seq->seq.l] = 0;
			if (seq->qual.l) {
				for (i = l_bc; i < seq->qual.l; ++i)
					seq->qual.s[i - l_bc] = seq->qual.s[i];
				seq->qual.l -= l_bc; seq->qual.s[seq->qual.l] = 0;
			}
			l = seq->seq.l;
		} else p->bc[0] = 0;
		p->tid = -1; // no assigned to a thread
		p->qual = 0;
		p->full_len = p->clip_len = p->len = l;
		n_tot += p->full_len;
		p->seq = (ubyte_t*)calloc(p->full_len, 1);
		// index through unsigned char: a plain (signed) char >= 0x80 would
		// give a negative table index
		for (i = 0; i != p->full_len; ++i)
			p->seq[i] = nst_nt4_table[(unsigned char)seq->seq.s[i]];
		if (seq->qual.l) { // copy quality
			p->qual = (ubyte_t*)strdup((char*)seq->qual.s);
			if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
		}
		p->rseq = (ubyte_t*)calloc(p->full_len, 1);
		memcpy(p->rseq, p->seq, p->len);
		seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
		seq_reverse(p->len, p->rseq, is_comp);
		p->name = strdup((const char*)seq->name.s);
		{ // trim /[12]$
			int t = strlen(p->name);
			if (t > 2 && p->name[t-2] == '/' && (p->name[t-1] == '1' || p->name[t-1] == '2')) p->name[t-2] = '\0';
		}
		if (n_seqs == n_needed) break;
	}
	*n = n_seqs;
	if (n_seqs && trim_qual >= 1)
		fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);
	if (n_seqs == 0) {
		free(seqs);
		return 0;
	}
	return seqs;
}

/**
 * Release an array of reads returned by bwa_read_seq(): the CIGAR of every
 * multi-hit, the per-read buffers (name, seq, rseq, qual, aln, md, multi,
 * cigar) and the array itself.
 *
 * @param n_seqs number of reads in `seqs`
 * @param seqs   array to release
 */
void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs)
{
	int i, j;
	for (i = 0; i != n_seqs; ++i) {
		bwa_seq_t *p = seqs + i;
		for (j = 0; j < p->n_multi; ++j)
			if (p->multi[j].cigar) free(p->multi[j].cigar);
		free(p->name);
		free(p->seq); free(p->rseq); free(p->qual); free(p->aln); free(p->md); free(p->multi);
		free(p->cigar);
	}
	free(seqs);
}
void filterReads(char * inBamFile, char * outBamFile, int minMapQual, int minLen, int maxMisMatches, float minPcId, float minPcAln, int ignoreSuppAlignments, int ignoreSecondaryAlignments) { // int result = -1; int outResult = -1; int supp_check = 0x0; if (ignoreSuppAlignments) { supp_check |= BAM_FSUPPLEMENTARY; } if (ignoreSecondaryAlignments) { supp_check |= BAM_FSECONDARY; } // helper variables BGZF* in = 0; BGZF* out = 0; bam1_t *b = bam_init1(); bam_hdr_t *h; // open bam if ((in = bgzf_open(inBamFile, "r")) == 0) { fprintf(stderr, "ERROR: Failed to open \"%s\" for reading.\n", inBamFile); } else if ((h = bam_hdr_read(in)) == 0) { // read header fprintf(stderr, "ERROR: Failed to read BAM header of file \"%s\".\n", inBamFile); } else if ((out = bgzf_open(outBamFile, "w")) == 0) { fprintf(stderr, "ERROR: Failed to open \"%s\" for writing.\n", outBamFile); } else { // write and destroy header bam_hdr_write(out, h); bam_hdr_destroy(h); int line = 0; int matches, mismatches, qLen; float pcAln, pcId; int showStats = 0; // fetch alignments while ((result = bam_read1(in, b)) >= 0) { line += 1; // only primary mappings if ((b->core.flag & supp_check) != 0) { if (showStats) fprintf(stdout, "Rejected %d, non-primary\n", line); continue; } // only high quality if (b->core.qual < minMapQual) { if (showStats) fprintf(stdout, "Rejected %d, quality: %d\n", line, b->core.qual); continue; } // not too many absolute mismatches mismatches = bam_aux2i(bam_aux_get(b, "NM")); if (mismatches > maxMisMatches) { if (showStats) fprintf(stdout, "Rejected %d, mismatches: %d\n", line, mismatches); continue; } // not too short qLen = bam_cigar2qlen((&b->core)->n_cigar, bam_get_cigar(b)); if (qLen < minLen) { if (showStats) fprintf(stdout, "Rejected %d, length: %d\n", line, qLen); continue; } // only high percent identity matches = bam_cigar2matches((&b->core)->n_cigar, bam_get_cigar(b)); pcId = (matches - mismatches) / (float)matches; // percentage as float between 0 to 1 if (pcId < 
minPcId) { if (showStats) fprintf(stdout, "Rejected %d, identity pc: %.4f\n", line, pcId); continue; } // only high percent alignment pcAln = matches / (float)qLen; // percentage as float between 0 to 1 if (pcAln < minPcAln) { if (showStats) fprintf(stdout, "Rejected %d, alignment pc: %.4f\n", line, pcAln); continue; } if ((outResult = bam_write1(out, b)) < -1) { fprintf(stderr, "ERROR: Attempt to write read no. %d to file \"%s\" failed with code %d.\n", line, outBamFile, outResult); } } if (result < -1) { fprintf(stderr, "ERROR: retrieval of read no. %d from file \"%s\" failed with code %d.\n", line, inBamFile, result); } } if (in) bgzf_close(in); if (out) bgzf_close(out); bam_destroy1(b); }
/*! @abstract Sort an unsorted BAM file based on the chromosome order and the leftmost position of an alignment @param is_by_qname whether to sort by query name @param fn name of the file to be sorted @param prefix prefix of the output and the temporary files; upon sucessess, prefix.bam will be written. @param max_mem approxiate maximum memory (very inaccurate) @discussion It may create multiple temporary subalignment files and then merge them by calling bam_merge_core(). This function is NOT thread safe. */ void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size_t _max_mem, int is_stdout, int n_threads, int level, int sort_type) { int ret, i, n_files = 0; size_t mem, max_k, k, max_mem; bam_header_t *header; bamFile fp; bam1_t *b, **buf; char *fnout = 0; if (n_threads < 2) n_threads = 1; g_is_by_qname = is_by_qname; max_k = k = 0; mem = 0; max_mem = _max_mem * n_threads; buf = 0; fp = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); if (fp == 0) { fprintf(stderr, "[bam_sort_core] fail to open file %s\n", fn); return; } header = bam_header_read(fp); if (is_by_qname) change_SO(header, "queryname"); else change_SO(header, "coordinate"); // write sub files for (;;) { if (k == max_k) { size_t old_max = max_k; max_k = max_k? max_k<<1 : 0x10000; buf = realloc(buf, max_k * sizeof(void*)); memset(buf + old_max, 0, sizeof(void*) * (max_k - old_max)); } if (buf[k] == 0) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t)); b = buf[k]; if ((ret = bam_read1(fp, b)) < 0) break; if (b->data_len < b->m_data>>2) { // shrink b->m_data = b->data_len; kroundup32(b->m_data); b->data = realloc(b->data, b->m_data); } mem += sizeof(bam1_t) + b->m_data + sizeof(void*) + sizeof(void*); // two sizeof(void*) for the data allocated to pointer arrays ++k; if (mem >= max_mem) { n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads, sort_type); mem = k = 0; } } if (ret != -1) fprintf(stderr, "[bam_sort_core] truncated file. 
Continue anyway.\n"); // output file name fnout = calloc(strlen(prefix) + 20, 1); if (is_stdout) sprintf(fnout, "-"); else sprintf(fnout, "%s.bam", prefix); // write the final output if (n_files == 0) { // a single block char mode[8]; strcpy(mode, "w"); if (level >= 0) sprintf(mode + 1, "%d", level < 9? level : 9); sort_aux_core(k, buf, sort_type); #ifndef _PBGZF_USE write_buffer(fnout, mode, k, buf, header, n_threads); #else write_buffer(fnout, mode, k, buf, header); #endif } else { // then merge char **fns; n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads, sort_type); fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n_files); fns = (char**)calloc(n_files, sizeof(char*)); for (i = 0; i < n_files; ++i) { fns[i] = (char*)calloc(strlen(prefix) + 20, 1); sprintf(fns[i], "%s.%.4d.bam", prefix, i); } #ifndef _PBGZF_USE bam_merge_core2(is_by_qname, fnout, 0, n_files, fns, 0, 0, n_threads, level); #else bam_merge_core2(is_by_qname, fnout, 0, n_files, fns, 0, 0, level); #endif for (i = 0; i < n_files; ++i) { unlink(fns[i]); free(fns[i]); } free(fns); } free(fnout); // free for (k = 0; k < max_k; ++k) { if (!buf[k]) continue; free(buf[k]->data); free(buf[k]); } free(buf); bam_header_destroy(header); bam_close(fp); }
int main(int argc,char* argv[]) { time_t timestamp, current; int i,j,k; int a,n; char *pc; FILE *input_file; FILE *output_file; FILE* log_file=stderr; bamFile bam_input; bam_header_t *header; bam1_t* b; bam1_core_t *c; char cps_file_name[MAXFILEBUFFLENGTH]=""; char bam_file_name[MAXFILEBUFFLENGTH]=""; char out_file_name[MAXFILEBUFFLENGTH]=""; char log_file_name[MAXFILEBUFFLENGTH]=""; char buff[MAXFILEBUFFLENGTH]; char chr[MAXFILEBUFFLENGTH]; int beg, beg_prev, end, pos, offset; int ref_id, ref_id_prev, label; int s, side; int read_type, mapped_strand; char ch; int limit_counts = 0; int* contig_count[2]; int* contig_index[2]; splice_site** contig_sites[2]; long int n_reads[N_READ_TYPES][2]; long int n_total_reads = 0; long int n_skipped_reads = 0; int max_intron_length=0; int min_intron_length=0; int ignore_gene_labels = 0; int stranded = 1; int rev_compl[2] = {1,0}; int other_end, the_end, donor_id, acceptor_id; int *cigar; int flagged = 0; int margin = 4; /** reading input from the command line **/ timestamp = time(NULL); if(argc==1) { fprintf(stderr, "BAM2SSJ is the utility for fast counting reads covering splice junctions\nCommand line use:\n"); fprintf(stderr, "%s -cps <cps_file> -bam <bam_file> [-out <out_file>] [-log <log_file>] [-maxlen <max_intron_length>] [-minlen <min_intron_length>] [-margin <length>] ",argv[0]); fprintf(stderr, "[-v suppress verbose output] [-read1 0/1] [-read2 0/1] [-g ignore gene labels] [-u unstranded] [-f count reads flagged 0x800 only]\ntype %s -h for more info\n",argv[0]); exit(1); } for(i=1;i<argc;i++) { pc = argv[i]; if(*pc == '-') { if(strcmp(pc+1,"cps") == 0) sscanf(argv[++i], "%s", &cps_file_name[0]); if(strcmp(pc+1,"bam") == 0) sscanf(argv[++i], "%s", &bam_file_name[0]); if(strcmp(pc+1,"out") == 0) sscanf(argv[++i], "%s", &out_file_name[0]); if(strcmp(pc+1,"log") == 0) sscanf(argv[++i], "%s", &log_file_name[0]); if(strcmp(pc+1,"read1") == 0) sscanf(argv[++i], "%i", &rev_compl[0]); if(strcmp(pc+1,"read2") == 0) 
sscanf(argv[++i], "%i", &rev_compl[1]); if(strcmp(pc+1,"lim") == 0) sscanf(argv[++i], "%i", &limit_counts); if(strcmp(pc+1,"minlen") == 0) sscanf(argv[++i], "%i", &min_intron_length); if(strcmp(pc+1,"maxlen") == 0) sscanf(argv[++i], "%i", &max_intron_length); if(strcmp(pc+1,"margin") == 0) sscanf(argv[++i], "%i", &margin); if(strcmp(pc+1,"v") == 0) verbose = 0; if(strcmp(pc+1,"g") == 0) ignore_gene_labels = 1; if(strcmp(pc+1,"u") == 0) stranded = 0; if(strcmp(pc+1,"f") == 0) flagged = 1; if(strcmp(pc+1,"h") ==0 ) { fprintf(stderr, "Input: (1) sorted BAM file\n"); fprintf(stderr, "\t(2) CPS (chromosome-position-strand) tab-delimited file sorted by position (chr1 100 + etc)\n\n"); fprintf(stderr, "\tIn order to get CPS file from gtf, use the utility gtf2cps.sh\n"); fprintf(stderr, "\tImportant: CPS must be sorted by position ONLY!\n\n"); fprintf(stderr, "\tIf the 4th column contains (a numeric) gene label then only splice junctions within the same gene will be considered (unless the '-g' option is active)\n"); fprintf(stderr, "\tThe utility to generate CPS with gene labels is gtf2cps_with_gene_id.sh (or update the script accordingly if you are using genome other than human)\n\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, "\t-maxlen <upper limit on intron length>; 0 = no limit (default=%i)\n",max_intron_length); fprintf(stderr, "\t-minlen <lower limit on intron length>; 0 = no limit (default=%i)\n",min_intron_length); fprintf(stderr, "\t-margin <length> minimum number of flanking nucleotides in the read in order to support SJ or cover EB, (default=%i)\n",margin); fprintf(stderr, "\t-read1 0/1, reverse complement read1 no/yes (default=%i)\n",rev_compl[0]); fprintf(stderr, "\t-read2 0/1, reverse complement read2 no/yes (default=%i)\n",rev_compl[1]); fprintf(stderr, "\t-g ignore gene labels (column 4 of cps), default=%s\n", ignore_gene_labels ? "ON" : "OFF"); fprintf(stderr, "\t-u ignore strand (all reads map to the correct strand), default=%s\n", stranded ? 
"OFF" : "ON"); fprintf(stderr, "\t-f count only reads that are flagged 0x800 (uniquely mapped reads), default=%s\n", flagged ? "ON" : "OFF"); fprintf(stderr, "Output: tab-delimited (default=stdout)\n"); fprintf(stderr, "\tColumn 1 is splice_junction_id\n"); fprintf(stderr, "\tColumns 2-6 are counts of 53, 5X, X3, 50, and 03 reads for the correct (annotated) strand\n"); fprintf(stderr, "\tColumns 7-11 are similar counts for the incorrect (opposite to annotated) strand\n"); fprintf(stderr, "Descriptive read statistics are reported to stderr\n"); exit(1); } } } if(log_file_name[0]==0) { log_file = stderr; } else { log_file = fopen(log_file_name,"w"); if(log_file == NULL) log_file = stderr; } if(bam_file_name[0]==0) { fprintf(log_file,"Bam not specified, exiting\n"); exit(1); } if(cps_file_name[0]==0) { fprintf(log_file,"Input not specified, exiting\n"); exit(1); } if(out_file_name[0]==0) { fprintf(log_file,"[Warning: output set to stdout]\n"); output_file = stdout; } else { output_file = fopen(out_file_name,"w"); if(output_file == NULL) { fprintf(log_file,"[Warning: output set to stdout]\n"); output_file = stdout; } } if(max_intron_length>0) { if(verbose) fprintf(log_file,"[Warning: set max intron length=%i]\n",max_intron_length); } if(ignore_gene_labels) { if(verbose) fprintf(log_file,"[Warning: ignoring gene labels (column 4)]\n"); } if(flagged) { if(verbose) fprintf(log_file,"[Warning: only look at reads flagged 0x800]\n"); } if(margin>0) { if(verbose) fprintf(log_file,"[Warning: read margin set to %i]\n", margin); } if(verbose) { for(s = 0; s < 2; s++) if(rev_compl[s]) fprintf(log_file,"[Warning: take reverse complement of read %i]\n", s+1); fprintf(log_file,"[Warning: stranded = %s]\n", stranded ? 
"TRUE" : "FALSE (always correct strand)"); if(ignore_gene_labels) fprintf(log_file,"[Warning: ignore gene labels (column 4)]\n"); } for(i = 0; i < N_READ_TYPES; i++) for(s = 0; s < 2; s++) n_reads[i][s] = 0; /** initatializing BAM and header **/ bam_input = bam_open(bam_file_name, "r"); header = bam_header_read(bam_input); if(bam_input == NULL || header == NULL) { fprintf(log_file,"BAM can't be opened or contains no header, exiting\n"); exit(1); } /** reading input from CPS **/ input_file = fopen(cps_file_name, "r"); if(input_file == NULL) { fprintf(log_file,"CPS can't be opened, exiting\n"); exit(1); } /** populating gene structure arrays **/ for(s = 0; s < 2; s++) { contig_count[s] = (int*) malloc(sizeof(int) * (header->n_targets + ARRAY_MARGIN)); contig_index[s] = (int*) malloc(sizeof(int) * (header->n_targets + ARRAY_MARGIN)); contig_sites[s] = (splice_site**) malloc(sizeof(splice_site*) * (header->n_targets + ARRAY_MARGIN)); if(contig_count[s] == NULL || contig_sites[s] == NULL || contig_index[s] == NULL) { fprintf(log_file, "Not enought memory, exiting\n"); exit(1); } } for(s = 0; s < 2; s++) for(i=0; i < header->n_targets; i++) contig_count[s][i] = contig_index[s][i] = 0; if(verbose) fprintf(log_file, "Reading %s pass1", cps_file_name); while(fgets(buff, MAXFILEBUFFLENGTH, input_file)) { sscanf(buff, "%s %*i %c", &chr[0], &ch); bam_parse_region(header, chr, &i, &beg, &end); s = (ch == '+' ? 
0 : 1); if(i < header->n_targets && i>=0) contig_count[s][i]++; } for(s = 0; s < 2; s++) { for(i = 0;i < header->n_targets; i++) { contig_sites[s][i] = (splice_site*) malloc(sizeof(splice_site) * (contig_count[s][i] + ARRAY_MARGIN)); if(contig_sites[s][i] == NULL) { fprintf(log_file, "Not enought memory, exiting\n"); exit(1); } } } if(verbose) fprintf(log_file, "\n"); if(verbose) fprintf(log_file, "Reading %s pass2",cps_file_name); fseek(input_file, 0, SEEK_SET); while(fgets(buff, MAXFILEBUFFLENGTH, input_file)) { sscanf(buff, "%s %i %c %i", &chr[0], &pos, &ch, &label); bam_parse_region(header, chr, &i, &beg, &end); s = (ch == '+' ? 0 : 1); if(i < header->n_targets && i>=0) { if(contig_index[s][i]>0) { if(pos < contig_sites[s][i][contig_index[s][i]-1].pos) { fprintf(log_file, "Splice sites weren't sorted, exiting\n"); exit(1); } } contig_sites[s][i][contig_index[s][i]].pos = pos; contig_sites[s][i][contig_index[s][i]].label = ignore_gene_labels ? 0 : label; for(side = 0; side < 2; side++) { contig_sites[s][i][contig_index[s][i]].count00[side] = 0; contig_sites[s][i][contig_index[s][i]].count5X[side] = 0; contig_sites[s][i][contig_index[s][i]].countX3[side] = 0; contig_sites[s][i][contig_index[s][i]].junctions = NULL; } contig_index[s][i]++; } } if(verbose) fprintf(log_file, "\n"); for(s = 0; s < 2; s++) for(i = 0;i < header->n_targets; i++) contig_index[s][i] = 0; /** analysis starts here **/ b = bam_init1(); k = 0; ref_id_prev = -1; beg_prev = -1; while(bam_read1(bam_input, b)>=0) { c = &b->core; ref_id = c->tid; if(ref_id<0) continue; if(flagged && ((c->flag & 0x800) == 0)) { n_skipped_reads++; continue; } if(stranded && ((c->flag & BAM_FREAD1) && (c->flag & BAM_FREAD2) || !(c->flag & BAM_FREAD1) && !(c->flag & BAM_FREAD2))) { n_skipped_reads++; continue; } cigar = bam1_cigar(b); if(ref_id != ref_id_prev && ref_id_prev >= 0) { if(contig_index[0][ref_id_prev] + contig_index[1][ref_id_prev] < contig_count[0][ref_id_prev] + contig_count[1][ref_id_prev]) { 
if(log_file==stderr) progressbar(1, 1, header->target_name[ref_id_prev], verbose); } beg_prev = -1; } /*if(ref_id < ref_id_prev) { fprintf(log_file,"BAM file wasn't sorted, exiting\n"); exit(1); }*/ ref_id_prev = ref_id; beg = c->pos + 1; if(beg < beg_prev) { fprintf(log_file,"BAM file wasn't sorted, exiting\n"); exit(1); } beg_prev = beg; s = ((c->flag & BAM_FREVERSE)>0); mapped_strand = (c->flag & BAM_FREAD1) ? (s + rev_compl[0]) & 1 : (s + rev_compl[1]) & 1; the_end = bam_calend(c, cigar); for(s = 0; s < 1 + stranded; s++) { end = beg; side = (s == mapped_strand) ? 0 : 1; side *= stranded; // keep reading until the currect site is on the same chromosome downstream of the read while(contig_sites[s][ref_id][contig_index[s][ref_id]].pos < beg && contig_index[s][ref_id] < contig_count[s][ref_id]) { contig_index[s][ref_id]++; if(log_file==stderr) progressbar(contig_index[0][ref_id]+contig_index[1][ref_id], contig_count[0][ref_id]+contig_count[1][ref_id], header->target_name[ref_id], verbose); } read_type = RT_OTHER; if(contig_index[s][ref_id]<contig_count[s][ref_id]) { // check if the read is a split read and find its other end read_type = RT_GENOME; for(i = 0; i < c->n_cigar; i++) { offset = cigar[i] >> 4; switch(cigar[i] & 0x0F) { case BAM_CMATCH: end += offset; // match to the reference break; case BAM_CINS: end += 0; // insertion to the reference, pointer stays unchanged break; case BAM_CDEL: end += offset; // deletion from the reference (technically the same as 'N') pointer moves break; case BAM_CREF_SKIP: other_end = end + offset; donor_id = acceptor_id = -INFTY; if(end - beg < margin) break; if(the_end - other_end < margin) break; for(j = contig_index[s][ref_id]; contig_sites[s][ref_id][j].pos <= other_end && j < contig_count[s][ref_id];j++) { if(contig_sites[s][ref_id][j].pos - end < min_intron_length && min_intron_length > 0) continue; if(contig_sites[s][ref_id][j].pos - end > max_intron_length && max_intron_length > 0) break; 
if(contig_sites[s][ref_id][j].label == contig_sites[s][ref_id][contig_index[s][ref_id]].label) { if(contig_sites[s][ref_id][j].pos == end - 1) donor_id = j; if(contig_sites[s][ref_id][j].pos == other_end) acceptor_id = j; } } if(donor_id>0 && acceptor_id>0) { update_count(&contig_sites[s][ref_id][donor_id].junctions, acceptor_id, side); contig_sites[s][ref_id][donor_id].count5X[side]++; contig_sites[s][ref_id][acceptor_id].countX3[side]++; read_type = RT_KJUNCT; } else { read_type = RT_UJUNCT; } end = other_end; break; case BAM_CSOFT_CLIP: case BAM_CHARD_CLIP: case BAM_CPAD: break; default: read_type = RT_OTHER; } } if(read_type == RT_GENOME) { for(j=contig_index[s][ref_id]; beg + margin <= contig_sites[s][ref_id][j].pos && contig_sites[s][ref_id][j].pos < end - margin && j<contig_count[s][ref_id]; j++) { contig_sites[s][ref_id][j].count00[side]++; read_type = RT_OVRLAP; k++; } } } n_reads[read_type][side]++; } n_total_reads++; if(k>limit_counts && limit_counts>0) break; }
void profileReads(char* bamFile, int ignoreSuppAlignments, int ignoreSecondaryAlignments) { // int result = -1; int supp_check = 0x0; if (ignoreSuppAlignments) { supp_check |= BAM_FSUPPLEMENTARY; } if (ignoreSecondaryAlignments) { supp_check |= BAM_FSECONDARY; } // helper variables BGZF* in = 0 ; bam1_t *b = bam_init1(); bam_hdr_t *h; // open bam if ((in = bgzf_open(bamFile, "r")) == 0) { fprintf(stderr, "ERROR: Failed to open \"%s\" for reading.\n", bamFile); } else if ((h = bam_hdr_read(in)) == 0) { // read header fprintf(stderr, "ERROR: Failed to read BAM header of file \"%s\".\n", bamFile); } else { // destroy header bam_hdr_destroy(h); int line = 0; int supplementary, secondary; int mapQual; int matches, mismatches, qLen; float pcAln, pcId; int showStats = 0; uint8_t *aux_mismatches; // print header printf("line\tsupp\tsecondary\tmapQ\tmismatches\tmatches\tqLen\tpcId\tpcAln\n"); // fetch alignments while ((result = bam_read1(in, b)) >= 0) { line += 1; // only primary mappings if ((b->core.flag & supp_check) != 0) { if (showStats) fprintf(stdout, "Rejected %d, non-primary\n", line); continue; } supplementary = (b->core.flag & (1 | BAM_FSUPPLEMENTARY)) != 0; secondary = (b->core.flag & (1 | BAM_FSECONDARY)) != 0; // quality mapQual = b->core.qual; // bam_aux_get returns 0 if optional NM tag is missing if ((aux_mismatches = bam_aux_get(b, "NM"))) mismatches = bam_aux2i(aux_mismatches); else mismatches = 0; // length qLen = bam_cigar2qlen((&b->core)->n_cigar, bam_get_cigar(b)); // percent identity matches = bam_cigar2matches((&b->core)->n_cigar, bam_get_cigar(b)); pcId = (matches - mismatches) / (float)matches; // percentage as float between 0 to 1 // percent alignment pcAln = matches / (float)qLen; // percentage as float between 0 to 1 // print read values printf("%d\t%d\t%d\t%d\t%d\t%d\t%d\t%.4f\t%.4f\n", line, supplementary, secondary, mapQual, mismatches, matches, qLen, pcId, pcAln); } if (result < -1) { fprintf(stderr, "ERROR: retrieval of read no. 
%d from file \"%s\" failed with code %d.\n", line, bamFile, result); } } if (in) bgzf_close(in); bam_destroy1(b); }