// currently, this function ONLY works if each read has one hit void bam_mating_core(bamFile in, bamFile out) { bam_header_t *header; bam1_t *b[2]; int curr, has_prev, pre_end = 0, cur_end; kstring_t str; str.l = str.m = 0; str.s = 0; header = bam_header_read(in); bam_header_write(out, header); b[0] = bam_init1(); b[1] = bam_init1(); curr = 0; has_prev = 0; while (bam_read1(in, b[curr]) >= 0) { bam1_t *cur = b[curr], *pre = b[1-curr]; if (cur->core.tid < 0) continue; cur_end = bam_calend(&cur->core, bam1_cigar(cur)); if (cur_end > (int)header->target_len[cur->core.tid]) cur->core.flag |= BAM_FUNMAP; if (cur->core.flag & BAM_FSECONDARY) continue; // skip secondary alignments if (has_prev) { if (strcmp(bam1_qname(cur), bam1_qname(pre)) == 0) { // identical pair name cur->core.mtid = pre->core.tid; cur->core.mpos = pre->core.pos; pre->core.mtid = cur->core.tid; pre->core.mpos = cur->core.pos; if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)) && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // set TLEN/ISIZE { uint32_t cur5, pre5; cur5 = (cur->core.flag&BAM_FREVERSE)? cur_end : cur->core.pos; pre5 = (pre->core.flag&BAM_FREVERSE)? 
pre_end : pre->core.pos; cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5; } else cur->core.isize = pre->core.isize = 0; if (pre->core.flag&BAM_FREVERSE) cur->core.flag |= BAM_FMREVERSE; else cur->core.flag &= ~BAM_FMREVERSE; if (cur->core.flag&BAM_FREVERSE) pre->core.flag |= BAM_FMREVERSE; else pre->core.flag &= ~BAM_FMREVERSE; if (cur->core.flag & BAM_FUNMAP) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FPROPER_PAIR; } if (pre->core.flag & BAM_FUNMAP) { cur->core.flag |= BAM_FMUNMAP; cur->core.flag &= ~BAM_FPROPER_PAIR; } bam_template_cigar(pre, cur, &str); bam_write1(out, pre); bam_write1(out, cur); has_prev = 0; } else { // unpaired or singleton pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0; if (pre->core.flag & BAM_FPAIRED) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FMREVERSE & ~BAM_FPROPER_PAIR; } bam_write1(out, pre); } } else has_prev = 1; curr = 1 - curr; pre_end = cur_end; } if (has_prev) bam_write1(out, b[1-curr]); bam_header_destroy(header); bam_destroy1(b[0]); bam_destroy1(b[1]); free(str.s); }
/*
 * Convert a padded BAM stream into an unpadded one.
 *
 * A record at position 0 whose name equals the name of its target is taken
 * as the padded reference: its unpadded sequence is kept in `r` and
 * `posmap` maps every padded coordinate to the unpadded one.  Every other
 * record that carries a CIGAR gets a new CIGAR (M/I/D/P derived from which
 * of read and reference have a base at each padded column) and its
 * position is translated through `posmap`.  Records without a CIGAR are
 * copied unchanged.
 *
 * Returns 0 on success and -1 when the header cannot be read, memory runs
 * out, or a read cannot be placed on the current padded reference (no
 * reference seen yet, empty alignment, or the read extends beyond the
 * reference).  The offending record is not written.
 */
int bam_pad2unpad(bamFile in, bamFile out)
{
	bam_header_t *h;
	bam1_t *b;
	kstring_t r, q;
	uint32_t *cigar2 = 0;
	int ret = 0, n2 = 0, m2 = 0, *posmap = 0;

	h = bam_header_read(in);
	if (h == 0) return -1;
	bam_header_write(out, h);
	b = bam_init1();
	r.l = r.m = q.l = q.m = 0; r.s = q.s = 0;
	while (bam_read1(in, b) >= 0) {
		uint32_t *cigar = bam1_cigar(b);
		n2 = 0;
		if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam1_qname(b), h->target_name[b->core.tid]) == 0) {
			// padded reference record
			int i, k;
			unpad_seq(b, &r);
			write_cigar(cigar2, n2, m2, bam_cigar_gen(b->core.l_qseq, BAM_CMATCH));
			replace_cigar(b, n2, cigar2);
			if (r.m > 0) {
				// keep the old buffer reachable if realloc fails
				int *grown = realloc(posmap, r.m * sizeof(int));
				if (grown == 0) { ret = -1; break; }
				posmap = grown;
			}
			for (i = k = 0; i < r.l; ++i) {
				posmap[i] = k; // note that a read should NOT start at a padding
				if (r.s[i]) ++k;
			}
		} else if (b->core.n_cigar > 0) {
			int i, k, op;
			unpad_seq(b, &q);
			// the loops below index r.s[pos .. pos+q.l-1], q.s[0] and posmap[pos]
			if (r.s == 0 || posmap == 0 || q.l == 0 || b->core.pos < 0
				|| (size_t)b->core.pos + q.l > r.l)
			{
				ret = -1;
				break;
			}
			if (bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) write_cigar(cigar2, n2, m2, cigar[0]);
			// classify every padded column of the read
			for (i = 0, k = b->core.pos; i < q.l; ++i, ++k)
				q.s[i] = q.s[i]? (r.s[k]? BAM_CMATCH : BAM_CINS) : (r.s[k]? BAM_CDEL : BAM_CPAD);
			// run-length encode the per-column operations
			for (i = k = 1, op = q.s[0]; i < q.l; ++i) {
				if (op != q.s[i]) {
					write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
					op = q.s[i]; k = 1;
				} else ++k;
			}
			write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
			if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CSOFT_CLIP) write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]);
			// collapse M-P-M into a single M: the padding is dropped and the two match runs merged
			for (i = 2; i < n2; ++i)
				if (bam_cigar_op(cigar2[i]) == BAM_CMATCH && bam_cigar_op(cigar2[i-1]) == BAM_CPAD && bam_cigar_op(cigar2[i-2]) == BAM_CMATCH)
					cigar2[i] += cigar2[i-2], cigar2[i-2] = cigar2[i-1] = 0;
			// squeeze out the entries zeroed above
			for (i = k = 0; i < n2; ++i)
				if (cigar2[i]) cigar2[k++] = cigar2[i];
			n2 = k;
			replace_cigar(b, n2, cigar2);
			b->core.pos = posmap[b->core.pos];
		}
		bam_write1(out, b);
	}
	free(r.s); free(q.s); free(posmap);
	free(cigar2); // grown by write_cigar(); previously leaked
	bam_destroy1(b);
	bam_header_destroy(h);
	return ret;
}
/*
 * Write one alignment to an open SAM/BAM handle.
 *
 * For a BAM handle the record is passed to bam_write1() and its return
 * value is returned.  Otherwise the record is formatted as text and written
 * to the FILE* `fp->x.tamw`, followed by a newline.
 *
 * Returns the number of bytes written for text output (text length plus the
 * newline), or -1 if `fp` is NULL, was opened for reading, the record could
 * not be formatted, or the stream reported a write error.
 */
int samwrite(samfile_t *fp, const bam1_t *b)
{
	if (fp == 0 || (fp->type & TYPE_READ)) return -1; // not open for writing
	if (fp->type & TYPE_BAM) return bam_write1(fp->x.bam, b);
	else {
		char *s = bam_format1_core(fp->header, b, fp->type>>2&3);
		int l, failed;
		if (s == 0) return -1;
		l = strlen(s);
		// report stream errors instead of claiming the bytes were written
		failed = (fputs(s, fp->x.tamw) == EOF || fputc('\n', fp->x.tamw) == EOF);
		free(s);
		return failed? -1 : l + 1;
	}
}
/*
 * Copy a BAM file while dropping selected records.
 *
 * argv[1] is the input BAM, argv[2] the output BAM.  The 0-based indices of
 * the records to drop are read from stdin, one after another; the loop only
 * ever compares against the most recently read index, so the indices have
 * to arrive in increasing order.  Once stdin has no further index, all
 * remaining records are copied.
 *
 * Returns 0 on success and -1 on any failure.
 */
int main(int argc, char** argv) {
    int status = -1;
    bamFile in = NULL;
    bamFile out = NULL;
    bam_header_t* header = NULL;
    bam1_t* b = NULL;
    long nextPrunedId = -1;
    long id = 0;

    if (argc < 3) {
        printf("No input nor output files provided");
        return -1;
    }

    in = bam_open(argv[1], "r");
    if (in == NULL) {
        printf("opening input file failed");
        goto done;
    }

    out = bam_open(argv[2], "w");
    if (out == NULL) {
        printf("opening output file failed");
        goto done;
    }

    header = bam_header_read(in);
    if (header == NULL) {
        printf("reading header failed");
        goto done;
    }
    if (bam_header_write(out, header) < 0) {
        printf("writing header failed");
        goto done;
    }

    // scanf returns EOF (non-zero) at end of input, so test for exactly one conversion
    if (scanf("%ld", &nextPrunedId) != 1) {
        printf("warning: no ids provided");
        goto done;
    }

    b = bam_init1();
    while (bam_read1(in, b) >= 0) {
        if (nextPrunedId != id++) {
            // write BAM back
            if (bam_write1(out, b) < 0) {
                printf("writing alignment failed");
                goto done;
            }
        } else if (scanf("%ld", &nextPrunedId) != 1) {
            // no further ids: keep copying, -1 never matches a record index
            nextPrunedId = -1;
        }
    }
    status = 0;

done:
    // closing all resources
    if (header != NULL) bam_header_destroy(header);
    if (in != NULL) bam_close(in);
    if (out != NULL) bam_close(out);
    if (b != NULL) bam_destroy1(b);
    return status;
}
/*
 * Emit one alignment through an open SAM/BAM handle.
 *
 * BAM handles forward to bam_write1().  Text handles format the record and
 * push it, followed by a single newline, through the handle's writer
 * callback.  Returns -1 when the handle is NULL or was opened for reading;
 * for text output returns the formatted length plus one for the newline.
 */
int samwrite(samfile_t *fp, const bam1_t *b)
{
	char *text;
	size_t text_len;

	/* not open for writing */
	if (fp == 0) return -1;
	if (fp->type & TYPE_READ) return -1;

	if (fp->type & TYPE_BAM)
		return bam_write1(fp->x.bam, b);

	text = bam_format1_core(fp->header, b, fp->type>>2&3);
	text_len = strlen(text);
	fp->x.tamw.writer(fp->x.tamw.writer_data, (uint8_t*) text, text_len);
	fp->x.tamw.writer(fp->x.tamw.writer_data, (uint8_t*) "\n", 1);
	free(text);
	return (int)text_len + 1;
}
void convert_sam_to_bam(char* sam_input, char* bam_input) { bam1_t* bam_p = bam_init1(); LOG_DEBUG("CONVERT-START: sam to bam\n"); //open SAM file for read if (time_flag) { start_timer(t1_convert); } tamFile sam_fd = sam_open(sam_input); //open BAM file for write bam_file_t* bam_file_p = bam_fopen_mode(bam_input, NULL, "w"); //read header from SAM file bam_header_t* bam_header_p = sam_header_read(sam_fd); //write header to BAM file bam_header_write(bam_file_p->bam_fd, bam_header_p); //write alignments to BAM file while (sam_read1(sam_fd, bam_header_p, bam_p) > 0) { bam_write1(bam_file_p->bam_fd, bam_p); num_alignments++; } //close BAM and SAM files, free bam alignment and bam file object bam_fclose(bam_file_p); sam_close(sam_fd); bam_header_destroy(bam_header_p); bam_destroy1(bam_p); if (time_flag) { stop_timer(t1_convert, t2_convert, convert_time); } //number_of_batchs = 1, convention value for statistics (not real batch) number_of_batchs = 1; }
/* Replace `tag` on `b` with a Z-type string made of `len` copies of `fill`;
 * an existing tag of that name is deleted first. */
static void uniform_set_fill_tag(bam1_t *b, const char *tag, int fill, int len)
{
	uint8_t *existing;
	char *value = malloc((len+1) * sizeof(char));

	memset(value, fill, len);
	value[len] = '\0';

	existing = bam_aux_get(b, tag);
	if (existing) {
		bam_aux_del(b, existing);
	}
	bam_aux_append(b, tag, 'Z', len+1, (uint8_t*) value);
	free(value);
}

/*
 * Per-read callback: give the read uniform BI and BD tags and write it out.
 *
 * `data` is a data_t_uniform; its `iq` and `dq` members provide the fill
 * character for the BI and BD strings respectively, each as long as the
 * read sequence.  The read is written to `data->out`.  Always returns 0.
 */
static int uniform_fetch_func(bam1_t *b, void *data)
{
	data_t_uniform *conf = (data_t_uniform*)data;
	int read_len = b->core.l_qseq;

	uniform_set_fill_tag(b, BI_TAG, conf->iq, read_len);
	uniform_set_fill_tag(b, BD_TAG, conf->dq, read_len);

	bam_write1(conf->out, b);
	return 0;
}
/*
 * Per-read callback: attach BI and BD tags derived from the homopolymer
 * counts of the reference, then write the read to `data->out`.
 *
 * `data` is a data_t_dindel.  It caches the homopolymer array (`hpcount`,
 * `rlen`) of the target last seen (`tid`) and refreshes it whenever a read
 * on another target arrives.  Reads matching BAM_DEF_MASK are written
 * unchanged.
 *
 * Every query base gets one quality character: aligned bases look up
 * DINDELQ by the homopolymer count at the following reference position
 * (DINDELQ[0] near the reference end or for counts above 18); inserted and
 * soft-clipped bases get DINDELQ[0].  The same string is stored under both
 * tags.
 *
 * Always returns 0.  Terminates the program via exit(1) on an unknown CIGAR
 * operation, a reference that cannot be fetched, an allocation failure, or
 * a CIGAR that consumes more query bases than the read has.
 */
static int dindel_fetch_func(bam1_t *b, void *data)
{
	data_t_dindel *tmp = (data_t_dindel*)data;
	bam1_core_t *c = &b->core;
	uint8_t *to_delete;

	/* don't change reads failing default mask: BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP */
	if (c->flag & BAM_DEF_MASK) {
		bam_write1(tmp->out, b);
		return 0;
	}

	/* get the reference sequence and compute homopolymer array */
	if (tmp->tid != c->tid) {
		int fetched_len; /* filled by fai_fetch(); strlen() is used instead */
		int rlen;
		char *ref = fai_fetch(tmp->fai, tmp->in->header->target_name[c->tid], &fetched_len);
		if (ref == NULL) {
			LOG_FATAL("couldn't fetch reference sequence %s\n", tmp->in->header->target_name[c->tid]);
			exit(1);
		}
		strtoupper(ref);/* safeguard */
		rlen = strlen(ref);
		tmp->tid = c->tid;
		free(tmp->hpcount);
		tmp->hpcount = (int*)malloc(rlen*sizeof(int));
		if (tmp->hpcount == NULL && rlen > 0) {
			LOG_FATAL("out of memory while processing read %s\n", bam1_qname(b));
			exit(1);
		}
		find_homopolymers(ref, tmp->hpcount, rlen);
		free(ref);
		tmp->rlen = rlen;
	}

	/* parse the cigar string */
	uint32_t *cigar = bam1_cigar(b);
	uint8_t indelq[c->l_qseq+1];
	int i;
	int x = c->pos; /* coordinate on reference */
	int y = 0; /* coordinate on query */

	/* l_qseq+1 bytes are appended below, so give every base a value up
	 * front: a CIGAR covering fewer bases must not leave garbage (or an
	 * early NUL) inside the tag */
	memset(indelq, DINDELQ[0], c->l_qseq);
	indelq[c->l_qseq] = '\0';

	for (i = 0; i < c->n_cigar; ++i) {
		int j, oplen = cigar[i]>>4, op = cigar[i]&0xf;
		if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
			if (y + oplen > c->l_qseq) {
				LOG_FATAL("cigar longer than sequence for read %s\n", bam1_qname(b));
				exit(1);
			}
			for (j = 0; j < oplen; j++) {
				/* FIXME clang complains: The left operand of '>' is a garbage value */
				indelq[y] = (x > tmp->rlen-2) ? DINDELQ[0] : (tmp->hpcount[x+1]>18 ?
					DINDELQ[0] : DINDELQ[tmp->hpcount[x+1]]);
				x++;
				y++;
			}
		} else if (op == BAM_CHARD_CLIP) {
			/* do nothing */
		} else if (op == BAM_CDEL) {
			x += oplen;
		} else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
			if (y + oplen > c->l_qseq) {
				LOG_FATAL("cigar longer than sequence for read %s\n", bam1_qname(b));
				exit(1);
			}
			for (j = 0; j < oplen; j++) {
				indelq[y] = DINDELQ[0];
				y++;
			}
		} else {
			LOG_FATAL("unknown op %d for read %s\n", op, bam1_qname(b));/* FIXME skip? seen this somewhere else properly handled */
			exit(1);
		}
	}

	to_delete = bam_aux_get(b, BI_TAG);
	if (to_delete) {
		bam_aux_del(b, to_delete);
	}
	bam_aux_append(b, BI_TAG, 'Z', c->l_qseq+1, indelq);

	to_delete = bam_aux_get(b, BD_TAG);
	if (to_delete) {
		bam_aux_del(b, to_delete);
	}
	bam_aux_append(b, BD_TAG, 'Z', c->l_qseq+1, indelq);

	bam_write1(tmp->out, b);
	return 0;
}
void filterReads(char * inBamFile, char * outBamFile, int minMapQual, int minLen, int maxMisMatches, float minPcId, float minPcAln, int ignoreSuppAlignments, int ignoreSecondaryAlignments) { // int result = -1; int outResult = -1; int supp_check = 0x0; if (ignoreSuppAlignments) { supp_check |= BAM_FSUPPLEMENTARY; } if (ignoreSecondaryAlignments) { supp_check |= BAM_FSECONDARY; } // helper variables BGZF* in = 0; BGZF* out = 0; bam1_t *b = bam_init1(); bam_hdr_t *h; // open bam if ((in = bgzf_open(inBamFile, "r")) == 0) { fprintf(stderr, "ERROR: Failed to open \"%s\" for reading.\n", inBamFile); } else if ((h = bam_hdr_read(in)) == 0) { // read header fprintf(stderr, "ERROR: Failed to read BAM header of file \"%s\".\n", inBamFile); } else if ((out = bgzf_open(outBamFile, "w")) == 0) { fprintf(stderr, "ERROR: Failed to open \"%s\" for writing.\n", outBamFile); } else { // write and destroy header bam_hdr_write(out, h); bam_hdr_destroy(h); int line = 0; int matches, mismatches, qLen; float pcAln, pcId; int showStats = 0; // fetch alignments while ((result = bam_read1(in, b)) >= 0) { line += 1; // only primary mappings if ((b->core.flag & supp_check) != 0) { if (showStats) fprintf(stdout, "Rejected %d, non-primary\n", line); continue; } // only high quality if (b->core.qual < minMapQual) { if (showStats) fprintf(stdout, "Rejected %d, quality: %d\n", line, b->core.qual); continue; } // not too many absolute mismatches mismatches = bam_aux2i(bam_aux_get(b, "NM")); if (mismatches > maxMisMatches) { if (showStats) fprintf(stdout, "Rejected %d, mismatches: %d\n", line, mismatches); continue; } // not too short qLen = bam_cigar2qlen((&b->core)->n_cigar, bam_get_cigar(b)); if (qLen < minLen) { if (showStats) fprintf(stdout, "Rejected %d, length: %d\n", line, qLen); continue; } // only high percent identity matches = bam_cigar2matches((&b->core)->n_cigar, bam_get_cigar(b)); pcId = (matches - mismatches) / (float)matches; // percentage as float between 0 to 1 if (pcId < 
minPcId) { if (showStats) fprintf(stdout, "Rejected %d, identity pc: %.4f\n", line, pcId); continue; } // only high percent alignment pcAln = matches / (float)qLen; // percentage as float between 0 to 1 if (pcAln < minPcAln) { if (showStats) fprintf(stdout, "Rejected %d, alignment pc: %.4f\n", line, pcAln); continue; } if ((outResult = bam_write1(out, b)) < -1) { fprintf(stderr, "ERROR: Attempt to write read no. %d to file \"%s\" failed with code %d.\n", line, outBamFile, outResult); } } if (result < -1) { fprintf(stderr, "ERROR: retrieval of read no. %d from file \"%s\" failed with code %d.\n", line, inBamFile, result); } } if (in) bgzf_close(in); if (out) bgzf_close(out); bam_destroy1(b); }
int main(int argc, char *argv[]) { short out2stdout=0; hashtable ht=new_hashtable(HASHSIZE); bamFile in,in2; bamFile out; int paired;//1 if not paired or pair read 1, 2 otherwise index_mem=sizeof(hashtable)*sizeof(hashnode**)*HASHSIZE*2; if (argc != 3) { fprintf(stderr, "Usage: bam_fix_NH <in.bam> <out.bam or - for stdout>\n"); return 1; } // Open file and exit if error in = bam_open(argv[1], "rb"); out2stdout = strcmp(argv[2], "-")? 0 : 1; out = strcmp(argv[2], "-")? bam_open(argv[2], "w") : bam_dopen(fileno(stdout), "w"); if (in == 0 ) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]); return 1; } if (out == 0) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[2]); return 1; } unsigned long num_alns=0; int ref; // *********** // Copy header bam_header_t *header; header = bam_header_read(in); bam_header_write(out,header); // sorted by name? // Should not rely on the value in SO bam1_t *aln=bam_init1(); bam1_t *prev=bam_init1(); if (!out2stdout) { fprintf(stderr,"bam_fix_NH version %s\n",VERSION); fprintf(stderr,"Processing %s\n",argv[1]); fprintf(stderr,"Hashing...\n");fflush(stderr); } while(bam_read1(in,aln)>=0) { // read alignment if (aln->core.tid < 0) continue;//ignore unaligned reads if (aln->core.flag & BAM_FUNMAP) continue; if (aln->core.flag & BAM_FREAD2) paired=2; else paired=1; ++num_alns; new_read_aln(ht,fix_read_name(bam1_qname(aln),paired)); if(!out2stdout) PRINT_ALNS_PROCESSED(num_alns); } bam_close(in); if(!out2stdout) { fprintf(stderr,"%s%lu\n",BACKLINE,num_alns); fprintf(stderr,"Hashing complete (%lu alignments)\n",num_alns); fprintf(stderr,"Memory used: %ld MB\n",index_mem/1024/1024); fprintf(stderr,"Updating entries with NH and printing BAM...\n"); fflush(stderr); } // reopen in2 = bam_open(argv[1], "rb"); if (in2 == 0 ) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]); return 1; } header = bam_header_read(in2); num_alns=0; while(bam_read1(in2,aln)>=0) { // read alignment paired=1; if (aln->core.tid < 0) 
continue;//ignore unaligned reads if (aln->core.flag & BAM_FUNMAP) continue; if (aln->core.flag & BAM_FREAD2) paired=2; ++num_alns; READ_ALN *r=get_read_aln(ht,fix_read_name(bam1_qname(aln),paired)); assert(r!=NULL); // update the NH field uint8_t *old_nh = bam_aux_get(aln, "NH"); int32_t nh=r->ctr; if (old_nh) { if (nh!=bam_aux2i(old_nh)) { fprintf(stderr,"warning: value mismatch! replacing>%s %d->%d\n",bam1_qname(aln),bam_aux2i(old_nh),nh); } bam_aux_del(aln, old_nh); bam_aux_append(aln, "NH", 'i', 4, (uint8_t*)&nh); #ifdef DEBUG // printf("!>%s %d\n",bam1_qname(aln),r->ctr); #endif } if (!old_nh) { // add NH bam_aux_append(aln, "NH", 'i', 4, (uint8_t*)&nh); #ifdef DEBUG fprintf(stderr,"!>%s %d\n",bam1_qname(aln),bam_aux2i(old_nh)); #endif } bam_write1(out,aln); if(!out2stdout) PRINT_ALNS_PROCESSED(num_alns); } // bam_destroy1(aln); bam_close(in2); bam_close(out); if(!out2stdout) { fprintf(stderr,"%s%lu\n",BACKLINE,num_alns); fprintf(stderr,"Done.\n"); } return 0; }
int main(int argc, char *argv[]) { hashtable ht=new_hashtable(HASHSIZE); bamFile in,in2; bamFile out; if (argc != 3) { fprintf(stderr, "Usage: bam_fix_NH <in.bam> <out.bam>\n"); return 1; } // Open file and exit if error //in = strcmp(argv[1], "-")? bam_open(argv[1], "rb") : bam_dopen(fileno(stdin), "rb"); in = bam_open(argv[1], "rb"); out = strcmp(argv[2], "-")? bam_open(argv[2], "w") : bam_dopen(fileno(stdout), "w"); if (in == 0 ) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]); return 1; } if (out == 0) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[2]); return 1; } unsigned long num_alns=0; int ref; // *********** // Copy header bam_header_t *header; header = bam_header_read(in); bam_header_write(out,header); // sorted by name? // Should not rely on the value in SO bam1_t *aln=bam_init1(); bam1_t *prev=bam_init1(); printf("Hashing...\n");flush(stdout); while(bam_read1(in,aln)>=0) { // read alignment if (aln->core.tid < 0) continue;//ignore unaligned reads ++num_alns; new_read_aln(ht,bam1_qname(aln)); } bam_close(in); printf("Hashing complete (%lu alignments)\n",num_alns); printf("Memory used in the hash: %ld MB\n",index_mem/1024/1024); flush(stdout); // reopen in2 = bam_open(argv[1], "rb"); if (in2 == 0 ) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]); return 1; } header = bam_header_read(in2); while(bam_read1(in2,aln)>=0) { // read alignment if (aln->core.tid < 0) continue;//ignore unaligned reads ++num_alns; READ_ALN *r=get_read_aln(ht,bam1_qname(aln)); //assert(r!=NULL); // update the NH field uint8_t *old_nh = bam_aux_get(aln, "NH"); uint8_t nh=r->ctr; if (old_nh) { if (nh!=bam_aux2i(old_nh)) { fprintf(stderr,"warning: value mismatch! 
replacing>%s %d->%d\n",bam1_qname(aln),bam_aux2i(old_nh),nh); } bam_aux_del(aln, old_nh); bam_aux_append(aln, "NH", 'i', 4, (uint8_t*)&nh); } if (!old_nh) { // add NH bam_aux_append(aln, "NH", 'i', 4, (uint8_t*)&nh); #ifdef DEBUG printf("!>%s %d\n",bam1_qname(aln),bam_aux2i(old_nh)); #endif } // in->header // Also fix the XS:A tag // BAM_FREAD1 // BAM_FREAD2 // BAM_FREVERSE the read is mapped to the reverse strand //bam1_cigar(b) //BAM_CREF_SKIP 3 CIGAR skip on the reference (e.g. spliced alignment) //BAM_FREVERSE 16 the read is mapped to the reverse strand if (aln->core.flag & BAM_FSECONDARY) continue; // skip secondary alignments if (aln->core.flag & ! BAM_FPAIRED) continue; // not paired if (aln->core.flag & ! BAM_FPROPER_PAIR) continue; // not a proper pair if (aln->core.flag & ! BAM_FMUNMAP) continue; // the mate is mapped if (aln->core.flag & BAM_FSECONDARY) continue; // secundary read if (aln->core.flag & BAM_FREAD2) continue; // only count each pair once // core.strand == 0 (f/+) 1 r/- // flag // bam1_qname(b) bam_write1(out,aln); } // bam_destroy1(aln); bam_close(in2); bam_close(out); return 0; /* uint8_t *old_nm = bam_aux_get(b, "NM"); 90 if (c->flag & BAM_FUNMAP) return; 91 if (old_nm) old_nm_i = bam_aux2i(old_nm); 92 if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); 93 else if (nm != old_nm_i) { 94 fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm); 95 bam_aux_del(b, old_nm); 96 bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); 97 } */ }
int main(int argc, char *argv[]) { short out2stdout=0; bamFile in,in2; bamFile out; if (argc != 3) { fprintf(stderr, "Usage: bam_fix_se_flag <in.bam> <out.bam or - for stdout>\n"); return 1; } // Open file and exit if error in = bam_open(argv[1], "rb"); out2stdout = strcmp(argv[2], "-")? 0 : 1; out = strcmp(argv[2], "-")? bam_open(argv[2], "w") : bam_dopen(fileno(stdout), "w"); if (in == 0 ) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]); return 1; } if (out == 0) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[2]); return 1; } unsigned long num_alns=0; int ref; // *********** // Copy header bam_header_t *header; header = bam_header_read(in); bam_header_write(out,header); // sorted by name? // Should not rely on the value in SO bam1_t *aln=bam_init1(); bam1_t *prev=bam_init1(); if (!out2stdout) { fprintf(stderr,"bam_fix_se_flag version %s\n",VERSION); fprintf(stderr,"Processing %s\n",argv[1]); } // reopen in2 = bam_open(argv[1], "rb"); if (in2 == 0 ) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]); return 1; } header = bam_header_read(in2); num_alns=0; while(bam_read1(in2,aln)>=0) { // read alignment if (aln->core.tid < 0) continue;//ignore unaligned reads if (aln->core.flag & BAM_FUNMAP) continue; if (aln->core.flag & BAM_FPAIRED ) { // PAIRED } else { //SE //turn off the other pair related flags aln->core.flag&=~BAM_FPROPER_PAIR; aln->core.flag&=~BAM_FMUNMAP; aln->core.flag&=~BAM_FREAD1; aln->core.flag&=~BAM_FREAD2; fprintf(stderr, "."); } bam_write1(out,aln); if(!out2stdout) PRINT_ALNS_PROCESSED(num_alns); ++num_alns; } // bam_destroy1(aln); bam_close(in2); bam_close(out); if(!out2stdout) { fprintf(stderr,"%s%lu\n",BACKLINE,num_alns); fprintf(stderr,"Done.\n"); } return 0; }
// FIX MRNM and unaligned reads int main(int argc, char *argv[]) { bamFile in; long num_unmapped=0; long num_alns_pe=0; if (argc != 3) { fprintf(stderr, "Usage: bam_tophat2_pe_fix <in.bam> <out.bam>\n"); return 1; } in = strcmp(argv[1], "-")? bam_open(argv[1], "rb") : bam_dopen(fileno(stdin), "rb"); if (in == 0) { fprintf(stderr, "ERROR: Fail to open input BAM file %s\n", argv[1]); return 1; } int ref; unsigned long num_alns=0; // counts unsigned long unalign_mapq_fix=0; unsigned long mtid_fix=0; unsigned long mpos_fix=0; bamFile out; bam_header_t *header; header = bam_header_read(in); bam1_t *aln=bam_init1(); out = strcmp(argv[2], "-")? bam_open(argv[2], "w") : bam_dopen(fileno(stdout), "w"); if (out == 0) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[2]); return 1; } bam_header_write(out,header); while(bam_read1(in,aln)>=0) { ++num_alns; if (aln->core.tid < 0) { // unaligned reads if ( aln->core.qual!=0 ) { //fprintf(stderr, "ERROR: Unaligned read with quality > 0 in line %lu\n",num_alns); aln->core.qual=0; unalign_mapq_fix++; } } //fprintf(stderr,"%s %c %d %d\n",bam1_qname(aln),(aln->core.tid<0?'U':'M'),aln->core.mtid,aln->core.mpos); if ( aln->core.flag & BAM_FPAIRED ) { //fprintf(stderr,"paired %d\n",(aln->core.flag & BAM_FMUNMAP)); // paired if ( aln->core.mtid <0 && !(aln->core.flag & BAM_FMUNMAP) ) { aln->core.flag |= BAM_FMUNMAP; aln->core.mpos=-1; mtid_fix++; } if ( aln->core.mpos <0 && !(aln->core.flag & BAM_FMUNMAP) ) { aln->core.flag |= BAM_FMUNMAP; aln->core.mtid=-1; mpos_fix++; } } bam_write1(out,aln); } bam_destroy1(aln); bam_close(in); bam_close(out); // fprintf(stderr,"unaligned MAPQ fixes: %lu\n",unalign_mapq_fix); fprintf(stderr,"unaligned mtid fixes: %lu\n",mtid_fix); fprintf(stderr,"unaligned mpos fixes: %lu\n",mpos_fix); return 0; }
// load a pair from a bam file SR_Status SR_BamInStreamLoadPair(SR_BamNode** ppUpAlgn, SR_BamNode** ppDownAlgn, SR_BamInStream* pBamInStream, bamFile* bam_writer_complete_bam) { khash_t(queryName)* pNameHashPrev = pBamInStream->pNameHashes[PREV_BIN]; khash_t(queryName)* pNameHashCurr = pBamInStream->pNameHashes[CURR_BIN]; int ret = 1; while(ret > 0 && (ret = SR_BamInStreamLoadNext(pBamInStream)) > 0) { // exclude those reads who are non-paired-end, qc-fail, duplicate-marked, proper-paired?!, // both aligned, secondary-alignment and no-name-specified. SR_Bool shouldBeFiltered = pBamInStream->filterFunc(pBamInStream->pNewNode, pBamInStream->filterData); if (shouldBeFiltered) { #ifdef VERBOSE_DEBUG fprintf(stderr,"%s: filtered.\n", bam1_qname(&(pBamInStream->pNewNode->alignment))); #endif if (bam_writer_complete_bam != NULL) bam_write1(*bam_writer_complete_bam, &(pBamInStream->pNewNode->alignment)); SR_BamNodeFree(pBamInStream->pNewNode, pBamInStream->pMemPool); pBamInStream->pNewNode = NULL; continue; } else { #ifdef VERBOSE_DEBUG fprintf(stderr,"%s: kept in buffer.\n", bam1_qname(&(pBamInStream->pNewNode->alignment))); #endif } // update the current ref ID or position if the incoming alignment has a // different value. 
The name hash and the bam array will be reset if (pNameHashPrev != NULL && (pBamInStream->pNewNode->alignment.core.tid != pBamInStream->currRefID || pBamInStream->pNewNode->alignment.core.pos >= pBamInStream->currBinPos + 2 * pBamInStream->binLen)) { if (pBamInStream->pNewNode->alignment.core.tid != pBamInStream->currRefID) { ret = SR_OUT_OF_RANGE; // different chromosome id } pBamInStream->currRefID = pBamInStream->pNewNode->alignment.core.tid; pBamInStream->currBinPos = pBamInStream->pNewNode->alignment.core.pos; // Clear the hash buffer kh_clear(queryName, pNameHashPrev); kh_clear(queryName, pNameHashCurr); // Store alignments before releasing them if (bam_writer_complete_bam != NULL) { SR_BamNode* cur = pBamInStream->pAlgnLists[PREV_BIN].first; for (int i = 0; i < pBamInStream->pAlgnLists[PREV_BIN].numNode; ++i) { // if the cur is not NULL, store the cur in the complete bam if (cur != NULL) bam_write1(*bam_writer_complete_bam, &(cur->alignment)); cur = cur->next; } // end for cur = pBamInStream->pAlgnLists[CURR_BIN].first; for (int i = 0; i < pBamInStream->pAlgnLists[CURR_BIN].numNode; ++i) { // if the cur is not NULL, store the cur in the complete bam if (cur != NULL) bam_write1(*bam_writer_complete_bam, &(cur->alignment)); cur = cur->next; } // end for } // end if SR_BamListReset(&(pBamInStream->pAlgnLists[PREV_BIN]), pBamInStream->pMemPool); SR_BamListReset(&(pBamInStream->pAlgnLists[CURR_BIN]), pBamInStream->pMemPool); } else if (pBamInStream->pNewNode->alignment.core.pos >= pBamInStream->currBinPos + pBamInStream->binLen) { pBamInStream->currBinPos += pBamInStream->binLen; kh_clear(queryName, pNameHashPrev); SR_SWAP(pNameHashPrev, pNameHashCurr, khash_t(queryName)*); // Store alignments before releasing them if (bam_writer_complete_bam != NULL) { SR_BamNode* cur = pBamInStream->pAlgnLists[PREV_BIN].first; for (int i = 0; i < pBamInStream->pAlgnLists[PREV_BIN].numNode; ++i) { // if the cur is not NULL, store the cur in the complete bam if (cur != NULL) 
bam_write1(*bam_writer_complete_bam, &(cur->alignment)); cur = cur->next; } } // end if SR_BamListReset(&(pBamInStream->pAlgnLists[PREV_BIN]), pBamInStream->pMemPool); SR_SWAP(pBamInStream->pAlgnLists[PREV_BIN], pBamInStream->pAlgnLists[CURR_BIN], SR_BamList); } else { } // end if-elseif-else