static int aux_fields1(void) { static const char sam[] = "data:" "@SQ\tSN:one\tLN:1000\n" "@SQ\tSN:two\tLN:500\n" "r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXA:A:k\tXi:i:37\tXf:f:" xstr(PI) "\tXd:d:" xstr(E) "\tXZ:Z:" HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,+2\tZZ:i:1000000\n"; // Canonical form of the alignment record above, as output by sam_format1() static const char r1[] = "r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXA:A:k\tXi:i:37\tXf:f:3.14159\tXd:d:2.71828\tXZ:Z:" HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,2\tZZ:i:1000000"; samFile *in = sam_open(sam, "r"); bam_hdr_t *header = sam_hdr_read(in); bam1_t *aln = bam_init1(); uint8_t *p; uint32_t n; kstring_t ks = { 0, 0, NULL }; if (sam_read1(in, header, aln) >= 0) { if ((p = check_bam_aux_get(aln, "XA", 'A')) && bam_aux2A(p) != 'k') fail("XA field is '%c', expected 'k'", bam_aux2A(p)); if ((p = check_bam_aux_get(aln, "Xi", 'C')) && bam_aux2i(p) != 37) fail("Xi field is %d, expected 37", bam_aux2i(p)); if ((p = check_bam_aux_get(aln, "Xf", 'f')) && fabs(bam_aux2f(p) - PI) > 1E-6) fail("Xf field is %.12f, expected pi", bam_aux2f(p)); if ((p = check_bam_aux_get(aln, "Xd", 'd')) && fabs(bam_aux2f(p) - E) > 1E-6) fail("Xf field is %.12f, expected e", bam_aux2f(p)); if ((p = check_bam_aux_get(aln, "XZ", 'Z')) && strcmp(bam_aux2Z(p), HELLO) != 0) fail("XZ field is \"%s\", expected \"%s\"", bam_aux2Z(p), HELLO); if ((p = check_bam_aux_get(aln, "XH", 'H')) && strcmp(bam_aux2Z(p), BEEF) != 0) fail("XH field is \"%s\", expected \"%s\"", bam_aux2Z(p), BEEF); // TODO Invent and use bam_aux2B() if ((p = check_bam_aux_get(aln, "XB", 'B')) && ! 
(memcmp(p, "Bc", 2) == 0 && (memcpy(&n, p+2, 4), n) == 3 && memcmp(p+6, "\xfe\x00\x02", 3) == 0)) fail("XB field is %c,..., expected c,-2,0,+2", p[1]); if ((p = check_bam_aux_get(aln, "ZZ", 'I')) && bam_aux2i(p) != 1000000) fail("ZZ field is %d, expected 1000000", bam_aux2i(p)); if (sam_format1(header, aln, &ks) < 0) fail("can't format record"); if (strcmp(ks.s, r1) != 0) fail("record formatted incorrectly: \"%s\"", ks.s); free(ks.s); } else fail("can't read record"); bam_destroy1(aln); bam_hdr_destroy(header); sam_close(in); return 1; }
/*
 * Update (or append) the B-array tag target_id on aln with nitems values of
 * the given element type, then verify the stored values.
 *
 * If next_id is non-empty, also verify that the tag named next_id still has
 * type next_type and integer value next_val, i.e. that resizing the array
 * did not clobber it.
 *
 * Returns 0 on success and -1 after reporting the problem through fail().
 * NOTE(review): CHECK_ARRAY_VALS is defined elsewhere; it presumably compares
 * data[] against the tag at p and returns -1 on mismatch -- confirm.
 */
static int test_update_array(bam1_t *aln, const char target_id[2],
                             uint8_t type, uint32_t nitems, void *data,
                             const char next_id[2], int64_t next_val,
                             char next_type)
{
    uint8_t *p;

    // Try updating target
    if (bam_aux_update_array(aln, target_id, type, nitems, data) < 0) {
        // "%.2s": tag ids are two characters and not NUL-terminated.  The
        // previous "%2.s" meant width 2 / precision 0 and printed blanks.
        fail("update %.2s tag", target_id);
        return -1;
    }

    // Check values
    p = bam_aux_get(aln, target_id);
    if (!p) {
        fail("find %.2s tag", target_id);
        return -1;
    }
    switch (type) {
        case 'c':
            CHECK_ARRAY_VALS(int8_t, bam_auxB2i, PRId64, PRId8); break;
        case 'C':
            CHECK_ARRAY_VALS(uint8_t, bam_auxB2i, PRId64, PRIu8); break;
        case 's':
            CHECK_ARRAY_VALS(int16_t, bam_auxB2i, PRId64, PRId16); break;
        case 'S':
            CHECK_ARRAY_VALS(uint16_t, bam_auxB2i, PRId64, PRIu16); break;
        case 'i':
            CHECK_ARRAY_VALS(int32_t, bam_auxB2i, PRId64, PRId32); break;
        case 'I':
            CHECK_ARRAY_VALS(uint32_t, bam_auxB2i, PRId64, PRIu32); break;
        case 'f':
            CHECK_ARRAY_VALS(float, bam_auxB2f, "e", "e"); break;
        default:
            break;
    }

    // If given, check that the next tag hasn't been clobbered by the
    // update above.
    if (!*next_id) return 0;
    p = bam_aux_get(aln, next_id);
    if (!p) {
        fail("find %.2s tag after updating %.2s", next_id, target_id);
        return -1;
    }
    if (*p != next_type || bam_aux2i(p) != next_val) {
        fail("after updating %.2s:"
             " %.2s field is %c:%"PRId64"; expected %c:%"PRId64,
             target_id, next_id, *p, bam_aux2i(p), next_type, next_val);
        return -1;
    }

    return 0;
}
// Value of the "NM" auxiliary tag of the wrapped record, or NO_COLOR_MM
// when the record carries no such tag.
int32_t getMismatches() const
{
    assert(m_dataPtr);

    uint8_t *nmTag = bam_aux_get(m_dataPtr.get(), "NM");
    if (nmTag == NULL) {
        return NO_COLOR_MM;
    }
    return bam_aux2i(nmTag);
}
// Value of the "XA" auxiliary tag of the wrapped record read as an integer,
// or INDEL_NO_AMBIGUITY when the tag is absent.
int getIndelAmbiguity() const
{
    assert(m_dataPtr);

    uint8_t *xaTag = bam_aux_get(m_dataPtr.get(), "XA");
    if (xaTag == NULL) {
        return INDEL_NO_AMBIGUITY;
    }
    return bam_aux2i(xaTag);
}
// True only when the record has an "NH" tag whose value is exactly 1.
// A record without the tag is reported as not uniquely mapped.
bool isMappedUnique() const
{
    assert(m_dataPtr);

    uint8_t *nhTag = bam_aux_get(m_dataPtr.get(), "NH");
    if (nhTag == NULL) {
        return false;
    }
    return bam_aux2i(nhTag) == 1;
}
// Value of the "NH" auxiliary tag of the wrapped record, or NO_NH when the
// record carries no such tag.
int32_t getReportedAlignments() const
{
    assert(m_dataPtr);

    uint8_t *nhTag = bam_aux_get(m_dataPtr.get(), "NH");
    if (nhTag == NULL) {
        return NO_NH;
    }
    return bam_aux2i(nhTag);
}
int32_t getMapQual() const // comes from SM tag, applies to this read if this read has a mate. { assert(m_dataPtr); //assert(shouldHaveMate()); // This will be controversial! uint8_t *mq = bam_aux_get(m_dataPtr.get(), "SM"); if (mq != NULL) return bam_aux2i(mq); return NO_MAP_QUAL; }
/*
 * Return the integer value of the "ZF" auxiliary tag of sam->b,
 * or -1 when the record has no such tag.
 */
int32_t
tmap_sam_get_fo_start_idx(tmap_sam_t *sam)
{
  uint8_t *zf = bam_aux_get(sam->b, "ZF");

  if(NULL == zf) {
      return -1;
  }
  return bam_aux2i(zf);
}
/*
 * Return the integer value of the "ZA" auxiliary tag of sam->b,
 * or -1 when the record has no such tag.  tmap_bug() is invoked first if
 * the wrapper holds no BAM record.
 */
int32_t
tmap_sam_get_za(tmap_sam_t *sam)
{
  uint8_t *za;

  if(NULL == sam->b) tmap_bug();

  za = bam_aux_get(sam->b, "ZA");
  if(NULL == za) {
      return -1;
  }
  return bam_aux2i(za);
}
/*
 * Set integer tag target_id on aln to target_val via bam_aux_update_int()
 * and confirm it is stored with type code expected_type and the same value.
 *
 * When next_id is non-empty, additionally confirm that the tag next_id is
 * still present with type next_type and value next_val.
 *
 * Returns 0 when every check passes; otherwise reports through fail() and
 * returns -1.
 */
static int test_update_int(bam1_t *aln, const char target_id[2],
                           int64_t target_val, char expected_type,
                           const char next_id[2], int64_t next_val,
                           char next_type)
{
    uint8_t *target;
    uint8_t *neighbour;

    /* Perform the update under test. */
    if (bam_aux_update_int(aln, target_id, target_val) < 0) {
        fail("update %.2s tag", target_id);
        return -1;
    }

    /* The tag must now exist with the expected type and value. */
    target = bam_aux_get(aln, target_id);
    if (target == NULL) {
        fail("find %.2s tag", target_id);
        return -1;
    }
    if (!(*target == expected_type && bam_aux2i(target) == target_val)) {
        fail("%.2s field is %c:%"PRId64"; expected %c:%"PRId64,
             target_id, *target, bam_aux2i(target), expected_type, target_val);
        return -1;
    }

    /* An empty next_id means there is no neighbouring tag to verify. */
    if (*next_id == '\0')
        return 0;

    neighbour = bam_aux_get(aln, next_id);
    if (neighbour == NULL) {
        fail("find %.2s tag after updating %.2s", next_id, target_id);
        return -1;
    }
    if (!(*neighbour == next_type && bam_aux2i(neighbour) == next_val)) {
        fail("after updating %.2s to %"PRId64":"
             " %.2s field is %c:%"PRId64"; expected %c:%"PRId64,
             target_id, target_val, next_id, *neighbour,
             bam_aux2i(neighbour), next_type, next_val);
        return -1;
    }

    return 0;
}
/*
 * Fetch the mate score stored in the "ms" auxiliary tag of b.
 * Prints an error to stderr and returns -1 when the tag is missing.
 */
static int64_t get_mate_score(bam1_t *b)
{
    uint8_t *ms_tag = bam_aux_get(b, "ms");

    if (ms_tag == NULL) {
        fprintf(stderr, "[markdup] error: no ms score tag.\n");
        return -1;
    }

    return bam_aux2i(ms_tag);
}
/*
 * Read records from the stream described by data (an aux_t) into b until one
 * passes all filters, and return the sam_read1() result for that record.
 * A negative sam_read1() result is returned immediately.
 *
 * A record is skipped when it is flagged unmapped, when its mapping quality
 * is below min_mapq, when bam_cigar2ulen() of its CIGAR is below min_len,
 * or when it carries an "AS" tag whose value is below min_as.  Records
 * without an "AS" tag are not filtered on score.
 */
int read_bam(void *data, bam1_t *b)
{
    aux_t *ctx = (aux_t*)data;

    for (;;) {
        uint8_t *as_tag;
        int status = sam_read1(ctx->fp, ctx->hdr, b);

        if (status < 0)
            return status;

        if (b->core.flag & (BAM_FUNMAP))
            continue;
        if ((int)b->core.qual < ctx->min_mapq)
            continue;
        if (bam_cigar2ulen(b->core.n_cigar, bam_get_cigar(b)) < ctx->min_len)
            continue;

        as_tag = bam_aux_get(b, "AS");
        if (as_tag && bam_aux2i(as_tag) < ctx->min_as)
            continue;

        return status;
    }
}
int main(int argc, char *argv[]) { short out2stdout=0; hashtable ht=new_hashtable(HASHSIZE); bamFile in,in2; bamFile out; int paired;//1 if not paired or pair read 1, 2 otherwise index_mem=sizeof(hashtable)*sizeof(hashnode**)*HASHSIZE*2; if (argc != 3) { fprintf(stderr, "Usage: bam_fix_NH <in.bam> <out.bam or - for stdout>\n"); return 1; } // Open file and exit if error in = bam_open(argv[1], "rb"); out2stdout = strcmp(argv[2], "-")? 0 : 1; out = strcmp(argv[2], "-")? bam_open(argv[2], "w") : bam_dopen(fileno(stdout), "w"); if (in == 0 ) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]); return 1; } if (out == 0) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[2]); return 1; } unsigned long num_alns=0; int ref; // *********** // Copy header bam_header_t *header; header = bam_header_read(in); bam_header_write(out,header); // sorted by name? // Should not rely on the value in SO bam1_t *aln=bam_init1(); bam1_t *prev=bam_init1(); if (!out2stdout) { fprintf(stderr,"bam_fix_NH version %s\n",VERSION); fprintf(stderr,"Processing %s\n",argv[1]); fprintf(stderr,"Hashing...\n");fflush(stderr); } while(bam_read1(in,aln)>=0) { // read alignment if (aln->core.tid < 0) continue;//ignore unaligned reads if (aln->core.flag & BAM_FUNMAP) continue; if (aln->core.flag & BAM_FREAD2) paired=2; else paired=1; ++num_alns; new_read_aln(ht,fix_read_name(bam1_qname(aln),paired)); if(!out2stdout) PRINT_ALNS_PROCESSED(num_alns); } bam_close(in); if(!out2stdout) { fprintf(stderr,"%s%lu\n",BACKLINE,num_alns); fprintf(stderr,"Hashing complete (%lu alignments)\n",num_alns); fprintf(stderr,"Memory used: %ld MB\n",index_mem/1024/1024); fprintf(stderr,"Updating entries with NH and printing BAM...\n"); fflush(stderr); } // reopen in2 = bam_open(argv[1], "rb"); if (in2 == 0 ) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]); return 1; } header = bam_header_read(in2); num_alns=0; while(bam_read1(in2,aln)>=0) { // read alignment paired=1; if (aln->core.tid < 0) 
continue;//ignore unaligned reads if (aln->core.flag & BAM_FUNMAP) continue; if (aln->core.flag & BAM_FREAD2) paired=2; ++num_alns; READ_ALN *r=get_read_aln(ht,fix_read_name(bam1_qname(aln),paired)); assert(r!=NULL); // update the NH field uint8_t *old_nh = bam_aux_get(aln, "NH"); int32_t nh=r->ctr; if (old_nh) { if (nh!=bam_aux2i(old_nh)) { fprintf(stderr,"warning: value mismatch! replacing>%s %d->%d\n",bam1_qname(aln),bam_aux2i(old_nh),nh); } bam_aux_del(aln, old_nh); bam_aux_append(aln, "NH", 'i', 4, (uint8_t*)&nh); #ifdef DEBUG // printf("!>%s %d\n",bam1_qname(aln),r->ctr); #endif } if (!old_nh) { // add NH bam_aux_append(aln, "NH", 'i', 4, (uint8_t*)&nh); #ifdef DEBUG fprintf(stderr,"!>%s %d\n",bam1_qname(aln),bam_aux2i(old_nh)); #endif } bam_write1(out,aln); if(!out2stdout) PRINT_ALNS_PROCESSED(num_alns); } // bam_destroy1(aln); bam_close(in2); bam_close(out); if(!out2stdout) { fprintf(stderr,"%s%lu\n",BACKLINE,num_alns); fprintf(stderr,"Done.\n"); } return 0; }
void signalFromBAM(const string bamFileName, const string sigFileName, Parameters P) { bam1_t *bamA; bamA=bam_init1(); double nMult=0, nUniq=0; if (P.outWigFlags.norm==1) {//count reads in the BAM file BGZF *bamIn=bgzf_open(bamFileName.c_str(),"r"); bam_hdr_t *bamHeader=bam_hdr_read(bamIn); while ( true ) {//until the end of file int bamBytes1=bam_read1(bamIn, bamA); if (bamBytes1<0) break; //end of file if (bamA->core.tid<0) continue; //unmapped read // if ( !std::regex_match(chrName.at(bamA->core.tid),std::regex(P.outWigReferencesPrefix))) continue; //reference does not mathc required references if ( P.outWigReferencesPrefix!="-" && (P.outWigReferencesPrefix.compare(0,P.outWigReferencesPrefix.size(),bamHeader->target_name[bamA->core.tid],P.outWigReferencesPrefix.size())!=0) ) continue; //reference does not match required references uint8_t* aNHp=bam_aux_get(bamA,"NH"); if (aNHp!=NULL) { uint32_t aNH=bam_aux2i(aNHp); if (aNH==1) {//unique mappers ++nUniq; } else if (aNH>1) { nMult+=1.0/aNH; }; }; }; bgzf_close(bamIn); }; BGZF *bamIn=bgzf_open(bamFileName.c_str(),"r"); bam_hdr_t *bamHeader=bam_hdr_read(bamIn); int sigN=P.outWigFlags.strand ? 4 : 2; double *normFactor=new double[sigN]; ofstream **sigOutAll=new ofstream* [sigN]; string* sigOutFileName=new string[sigN]; sigOutFileName[0]=sigFileName+".Unique.str1.out"; sigOutFileName[1]=sigFileName+".UniqueMultiple.str1.out"; if (P.outWigFlags.strand) { sigOutFileName[2]=sigFileName+".Unique.str2.out"; sigOutFileName[3]=sigFileName+".UniqueMultiple.str2.out"; }; for (int ii=0; ii<sigN; ii++) { sigOutFileName[ii]+= (P.outWigFlags.format==0 ? 
".bg" : ".wig"); sigOutAll[ii]=new ofstream ( sigOutFileName[ii].c_str() ); }; if (P.outWigFlags.norm==0) {//raw counts normFactor[0]=1; normFactor[1]=1; } else if (P.outWigFlags.norm==1) {//normlaized normFactor[0]=1.0e6 / nUniq; normFactor[1]=1.0e6 / (nUniq+nMult); for (int is=0;is<sigN;is++) {//formatting double output *sigOutAll[is]<<setiosflags(ios::fixed) << setprecision(5); }; }; if (P.outWigFlags.strand) { normFactor[2]=normFactor[0]; normFactor[3]=normFactor[1]; }; int iChr=-999; double *sigAll=NULL; uint32_t chrLen=0; while ( true ) {//until the end of file int bamBytes1=bam_read1(bamIn, bamA); if (bamA->core.tid!=iChr || bamBytes1<0) { //output to file if (iChr!=-999) {//iChr=-999 marks chromosomes that are not output, including unmapped reads for (int is=0;is<sigN;is++) { if (P.outWigFlags.format==1) { *sigOutAll[is] <<"variableStep chrom="<<bamHeader->target_name[iChr] <<"\n"; }; double prevSig=0; for (uint32_t ig=0;ig<chrLen;ig++) { double newSig=sigAll[sigN*ig+is]; if (P.outWigFlags.format==0) {//bedGraph if (newSig!=prevSig) { if (prevSig!=0) {//finish previous record *sigOutAll[is] <<ig<<"\t"<<prevSig*normFactor[is] <<"\n"; //1-based end }; if (newSig!=0) { *sigOutAll[is] << bamHeader->target_name[iChr] <<"\t"<< ig <<"\t"; //0-based beginning }; prevSig=newSig; }; } else if (P.outWigFlags.format==1){//wiggle if (newSig!=0) { *sigOutAll[is] <<ig+1<<"\t"<<newSig*normFactor[is] <<"\n"; }; }; }; }; }; if (bamBytes1<0) {//no more reads break; }; iChr=bamA->core.tid; if ( iChr==-1 || (P.outWigReferencesPrefix!="-" && (P.outWigReferencesPrefix.compare(0,P.outWigReferencesPrefix.size(),bamHeader->target_name[bamA->core.tid],P.outWigReferencesPrefix.size())!=0) ) ) { iChr=-999; continue; //reference does not match required references }; chrLen=bamHeader->target_len[iChr]+1;//one extra base at the end which sohuld always be 0 delete [] sigAll; sigAll= new double[sigN*chrLen]; memset(sigAll, 0, sizeof(*sigAll)*sigN*chrLen); }; // uint32_t nCigar 
=(bamA->core.flag<<16)>>16; // uint32_t mapFlag=bamA->core.flag>>16; // uint32_t mapQ=(bamA->core.flag<<16)>>24; #define BAM_CIGAR_OperationShift 4 #define BAM_CIGAR_LengthBits 28 #define BAM_CIGAR_M 0 #define BAM_CIGAR_I 1 #define BAM_CIGAR_D 2 #define BAM_CIGAR_N 3 #define BAM_CIGAR_S 4 #define BAM_CIGAR_H 5 #define BAM_CIGAR_P 6 #define BAM_CIGAR_EQ 7 #define BAM_CIGAR_X 8 //by default, alignments marked as duplicate are not processed if ( (bamA->core.flag & 0x400) > 0 ) continue; //NH attribute uint8_t* aNHp=bam_aux_get(bamA,"NH"); uint32_t aNH; if (aNHp==NULL) { aNH=1; //no NH tag: assume NH=1 //continue; //do not process lines without NH field } else { aNH=bam_aux2i(bam_aux_get(bamA,"NH")); //write a safer function allowing for lacking NH tag }; if (aNH==0) continue; //do not process lines without NH=0 uint32_t aG=bamA->core.pos; uint32_t iStrand=0; if (P.outWigFlags.strand) {//strand for stranded data from SAM flag iStrand= ( (bamA->core.flag & 0x10) > 0 ) == ( (bamA->core.flag & 0x80) == 0 );//0/1 for +/- }; if (P.outWigFlags.type==1) {//5' of the1st read signal only, RAMPAGE/CAGE if ( (bamA->core.flag & 0x80)>0) continue; //skip if this the second mate if (iStrand==0) { if (aNH==1) {//unique mappers sigAll[aG*sigN+0+2*iStrand]++; }; sigAll[aG*sigN+1+2*iStrand]+=1.0/aNH;//U+M, normalized by the number of multi-mapping loci continue; //record only the first position }; }; uint32_t* cigar=(uint32_t*) (bamA->data+bamA->core.l_qname); for (uint32_t ic=0; ic<bamA->core.n_cigar; ic++) { uint32_t cigOp=(cigar[ic]<<BAM_CIGAR_LengthBits)>>BAM_CIGAR_LengthBits; uint32_t cigL=cigar[ic]>>BAM_CIGAR_OperationShift; switch (cigOp) { case(BAM_CIGAR_D): case(BAM_CIGAR_N): aG+=cigL; break; case(BAM_CIGAR_M): if (P.outWigFlags.type==0 || (P.outWigFlags.type==2 && (bamA->core.flag & 0x80)>0 )) {//full signal, or second mate onyl signal for (uint32_t ig=0;ig<cigL;ig++) { if (aG>=chrLen) { cerr << "BUG: alignment extends past chromosome in signalFromBAM.cpp\n"; exit(-1); }; if 
(aNH==1) {//unique mappers sigAll[aG*sigN+0+2*iStrand]++; }; sigAll[aG*sigN+1+2*iStrand]+=1.0/aNH;//U+M, normalized by the number of multi-mapping loci aG++; }; } else { aG+=cigL; }; }; }; if (P.outWigFlags.type==1) {//full signal --aG; if (aNH==1) {//unique mappers sigAll[aG*sigN+0+2*iStrand]++; }; sigAll[aG*sigN+1+2*iStrand]+=1.0/aNH;//U+M, normalized by the number of multi-mapping loci }; }; delete [] sigAll; for (int is=0; is<sigN; is++) {// flush/close all signal files sigOutAll[is]->flush(); sigOutAll[is]->close(); }; };
int main(int argc, char *argv[]) { bamFile in; sqlite3 * db; sqlite3_stmt * stmt; char * sErrMsg = NULL; char * tail = 0; int nRetCode; char sSQL [BUFFER_SIZE] = "\0"; char database[BUFFER_SIZE]; clock_t startClock,startClock2; if (argc != 2) { fprintf(stderr, "Usage: bamRindex <in.bam>\n"); return 1; } // Open file and exit if error //in = strcmp(argv[1], "-")? bam_open(argv[1], "rb") : bam_dopen(fileno(stdin), "rb"); //fprintf(stderr,"Options ok\n"); in = bam_open(argv[1], "rb"); if (in == 0 ) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]); return 1; } //fprintf(stderr,"BAM opened\n"); assert(strcpy(database,argv[1])!=NULL); assert(strcat(database,".ridx")!=NULL); remove(database); // *********** // Read header bam_header_t *header; header = bam_header_read(in); // sorted by name? // Should not rely on the value in SO bam1_t *aln=bam_init1(); unsigned long num_alns=0; /*********************************************/ /* Open the Database and create the Schema */ // TODO: check the errors sqlite3_open(database, &db); sqlite3_exec(db, TABLE, NULL, NULL, &sErrMsg); // create the table SQLITE_CHECK_ERROR(); startClock = clock(); sqlite3_exec(db, "PRAGMA synchronous = 0;", NULL, NULL, &sErrMsg); SQLITE_CHECK_ERROR(); sqlite3_exec(db, "PRAGMA journal_mode = OFF;", NULL, NULL, &sErrMsg); SQLITE_CHECK_ERROR(); // Use up to 8GB of memory sqlite3_exec(db, "PRAGMA cache_size = -8000000;", NULL, NULL, &sErrMsg); SQLITE_CHECK_ERROR(); sqlite3_exec(db, "BEGIN TRANSACTION;", NULL, NULL, &sErrMsg); SQLITE_CHECK_ERROR(); while(bam_read1(in,aln)>=0) { // read alignment //aln->core.tid < 0 ? uint8_t *nh = bam_aux_get(aln, "NH"); uint8_t *nm = bam_aux_get(aln, "NM"); uint8_t *xs = bam_aux_get(aln, "XS"); BOOLEAN isPrimary; BOOLEAN isMapped; BOOLEAN notMapped; BOOLEAN isDuplicate; BOOLEAN isNotPassingQualityControls; BOOLEAN isPaired; BOOLEAN isSecondMateRead,isProperPair; //secondary alignment notMapped=(aln->core.flag & BAM_FUNMAP) ? 
TRUE: FALSE; //notMapped=((aln->core.flag & BAM_FUNMAP) || (aln->core.mtid ==0)) ? TRUE: FALSE; isMapped=!notMapped; isPrimary= (aln->core.flag & BAM_FSECONDARY) ? FALSE:TRUE; isProperPair=(aln->core.flag & BAM_FPROPER_PAIR) ? TRUE:FALSE; isPaired=(aln->core.flag & BAM_FPAIRED ) ? TRUE:FALSE; isSecondMateRead=(aln->core.flag & BAM_FREAD2 ) ? TRUE: FALSE; isNotPassingQualityControls=(aln->core.flag & BAM_FQCFAIL ) ? TRUE:FALSE; isDuplicate=(aln->core.flag & BAM_FDUP) ? TRUE: FALSE; BOOLEAN isSpliced=FALSE; BOOLEAN hasSimpleCigar=TRUE; int nSpliced=0; int i; if (aln->core.n_cigar != 0) { for (i = 0; i < aln->core.n_cigar; ++i) { char l="MIDNSHP=X"[bam1_cigar(aln)[i]&BAM_CIGAR_MASK]; //fprintf(stderr,"%c",l); if ( l == 'N' ) { isSpliced=TRUE; hasSimpleCigar=FALSE;++nSpliced;} if ( l != 'M' && l!='=' ) { hasSimpleCigar=FALSE;} } } //fprintf(stderr,"read %ld\n",num_alns); // isDuplicate,isNotPassingQualityControls, // isSpliced,isPAired,isPrimary,hasSimpleCigar,isSecondMateRead,isProperPair,nh,nm,qual/mapq,xs sprintf(sSQL,"INSERT into bam_index values (%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,'%c')", isDuplicate,isNotPassingQualityControls, nSpliced,isPaired,isPrimary,isMapped,hasSimpleCigar,isSecondMateRead,isProperPair, (nh==0?0:bam_aux2i(nh)),(nm==0?0:bam_aux2i(nm)), aln->core.qual, (xs==0?' ':(bam_aux2A(xs)==0?' 
':bam_aux2A(xs)))); sqlite3_exec(db, sSQL, NULL, NULL, &sErrMsg); SQLITE_CHECK_ERROR(); ++num_alns; PRINT_ALNS_PROCESSED(num_alns); } bam_close(in); sqlite3_exec(db, "END TRANSACTION;", NULL, NULL, &sErrMsg); SQLITE_CHECK_ERROR(); printf("\nImported %d records in %4.2f seconds\n", num_alns, ( (double) (clock() - startClock))/CLOCKS_PER_SEC); // Create the indexes startClock2 = clock(); // generating the indexes does not pay off //sqlite3_exec(db, INDEXES, NULL, NULL, &sErrMsg); //printf("Indexed %d records in %4.2f seconds\n", num_alns, ( (double) (clock() - startClock2))/CLOCKS_PER_SEC); printf("Total time: %4.2f seconds\n", ((double)(clock() - startClock))/CLOCKS_PER_SEC); sqlite3_close(db); return 0; }
/*
 * Recompute the MD string and NM count of alignment b against the reference
 * sequence ref (ref_len bases, indexed by reference position) and update the
 * record according to the bits set in flag:
 *
 *   USE_EQUAL  - bases matching the reference are replaced by '=' (code 0)
 *   UPDATE_NM  - add the NM tag, or replace it (with a warning) if it differs
 *   UPDATE_MD  - add the MD tag, or replace it (with a warning) if it differs
 *   DROP_TAG   - drop every auxiliary tag except RG
 *   BIN_QUAL   - bin base qualities >= 3 to q/10*10 + 7
 *
 * If max_nm > 0 and the computed NM reaches it, matching bases are masked
 * (set to code 15) and their qualities zeroed.  NM/MD are left untouched for
 * records flagged unmapped.  Processing of the CIGAR stops early if it runs
 * past ref_len or hits a NUL in ref.
 */
void bam_fillmd1_core(bam1_t *b, char *ref, int ref_len, int flag, int max_nm)
{
    uint8_t *seq = bam_get_seq(b);
    uint32_t *cigar = bam_get_cigar(b);
    bam1_core_t *c = &b->core;
    int i, x, y, u = 0;
    // Stack-allocated buffer: avoids the unchecked calloc() of the old code.
    kstring_t md = { 0, 0, NULL };
    kstring_t *str = &md;
    int32_t old_nm_i = -1, nm = 0;

    // x: position on the reference, y: position on the read,
    // u: length of the current run of matches.
    for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
        int j, l = cigar[i]>>4, op = cigar[i]&0xf;
        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
            for (j = 0; j < l; ++j) {
                int c1, c2, z = y + j;
                if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
                c1 = bam_seqi(seq, z);
                // Index through unsigned char: a plain char may be negative
                // for bytes >= 0x80 and would read before the table.
                c2 = seq_nt16_table[(unsigned char)ref[x+j]];
                if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
                    if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 0xf0 : 0x0f;
                    ++u;
                } else {
                    kputw(u, str); kputc(ref[x+j], str);
                    u = 0; ++nm;
                }
            }
            if (j < l) break;
            x += l; y += l;
        } else if (op == BAM_CDEL) {
            kputw(u, str); kputc('^', str);
            for (j = 0; j < l; ++j) {
                if (x+j >= ref_len || ref[x+j] == '\0') break;
                kputc(ref[x+j], str);
            }
            u = 0;
            x += j; nm += j;
            if (j < l) break;
        } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
            y += l;
            if (op == BAM_CINS) nm += l;
        } else if (op == BAM_CREF_SKIP) {
            x += l;
        }
    }
    kputw(u, str);

    // apply max_nm
    if (max_nm > 0 && nm >= max_nm) {
        for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
            int j, l = cigar[i]>>4, op = cigar[i]&0xf;
            if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
                for (j = 0; j < l; ++j) {
                    int c1, c2, z = y + j;
                    if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
                    c1 = bam_seqi(seq, z);
                    c2 = seq_nt16_table[(unsigned char)ref[x+j]];
                    if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
                        seq[z/2] |= (z&1)? 0x0f : 0xf0;
                        bam_get_qual(b)[z] = 0;
                    }
                }
                if (j < l) break;
                x += l; y += l;
            } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
            else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
        }
    }

    // update NM
    if ((flag & UPDATE_NM) && !(c->flag & BAM_FUNMAP)) {
        uint8_t *old_nm = bam_aux_get(b, "NM");
        if (old_nm) old_nm_i = bam_aux2i(old_nm);
        if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
        else if (nm != old_nm_i) {
            fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam_get_qname(b), old_nm_i, nm);
            bam_aux_del(b, old_nm);
            bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
        }
    }

    // update MD
    if ((flag & UPDATE_MD) && !(c->flag & BAM_FUNMAP)) {
        uint8_t *old_md = bam_aux_get(b, "MD");
        if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
        else {
            // Case-insensitive comparison; old_md[0] is the type byte.
            int is_diff = 0;
            if (strlen((char*)old_md+1) == str->l) {
                for (i = 0; i < str->l; ++i)
                    if (toupper(old_md[i+1]) != toupper(str->s[i]))
                        break;
                if (i < str->l) is_diff = 1;
            } else is_diff = 1;
            if (is_diff) {
                fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), old_md+1, str->s);
                bam_aux_del(b, old_md);
                bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
            }
        }
    }

    // drop all tags but RG
    if (flag&DROP_TAG) {
        uint8_t *q = bam_aux_get(b, "RG");
        bam_aux_drop_other(b, q);
    }

    // reduce the resolution of base quality
    if (flag&BIN_QUAL) {
        uint8_t *qual = bam_get_qual(b);
        for (i = 0; i < b->core.l_qseq; ++i)
            if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7;
    }

    free(str->s);
}
static int aux_fields1(void) { static const char sam[] = "data:," "@SQ\tSN:one\tLN:1000\n" "@SQ\tSN:two\tLN:500\n" "r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXA:A:k\tXi:i:37\tXf:f:" xstr(PI) "\tXd:d:" xstr(E) "\tXZ:Z:" HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,+2\tB0:B:i,-2147483648,-1,0,1,2147483647\tB1:B:I,0,1,2147483648,4294967295\tB2:B:s,-32768,-1,0,1,32767\tB3:B:S,0,1,32768,65535\tB4:B:c,-128,-1,0,1,127\tB5:B:C,0,1,127,255\tBf:B:f,-3.14159,2.71828\tZZ:i:1000000\tF2:d:2.46801\tY1:i:-2147483648\tY2:i:-2147483647\tY3:i:-1\tY4:i:0\tY5:i:1\tY6:i:2147483647\tY7:i:2147483648\tY8:i:4294967295\n"; // Canonical form of the alignment record above, as output by sam_format1() static const char r1[] = "r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXi:i:37\tXf:f:3.14159\tXd:d:2.71828\tXZ:Z:" NEW_HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,2\tB0:B:i,-2147483648,-1,0,1,2147483647\tB1:B:I,0,1,2147483648,4294967295\tB2:B:s,-32768,-1,0,1,32767\tB3:B:S,0,1,32768,65535\tB4:B:c,-128,-1,0,1,127\tB5:B:C,0,1,127,255\tBf:B:f,-3.14159,2.71828\tZZ:i:1000000\tF2:f:9.8765\tY1:i:-2147483648\tY2:i:-2147483647\tY3:i:-1\tY4:i:0\tY5:i:1\tY6:i:2147483647\tY7:i:2147483648\tY8:i:4294967295\tN0:i:-1234\tN1:i:1234\tN2:i:-2\tN3:i:3\tF1:f:4.5678\tN4:B:S,65535,32768,1,0\tN5:i:4242"; samFile *in = sam_open(sam, "r"); bam_hdr_t *header = sam_hdr_read(in); bam1_t *aln = bam_init1(); uint8_t *p; kstring_t ks = { 0, 0, NULL }; int64_t b0vals[5] = { -2147483648LL,-1,0,1,2147483647LL }; // i int64_t b1vals[4] = { 0,1,2147483648LL,4294967295LL }; // I int64_t b2vals[5] = { -32768,-1,0,1,32767 }; // s int64_t b3vals[4] = { 0,1,32768,65535 }; // S int64_t b4vals[5] = { -128,-1,0,1,127 }; // c int64_t b5vals[4] = { 0,1,127,255 }; // C // NB: Floats not doubles below! 
// See https://randomascii.wordpress.com/2012/06/26/doubles-are-not-floats-so-dont-compare-them/ float bfvals[2] = { -3.14159f, 2.71828f }; int8_t n4v1[] = { -128, -64, -32, -16, -8, -4, -2, -1, 0, 1, 2, 4, 8, 16, 32, 64, 127 }; uint32_t n4v2[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1234, 5678, 1U << 31, 0 }; int16_t n4v3[] = { -32768, -1, 0, 1, 32767 }; float n4v4[] = { 0, 1, 2, 10, 20, 30, 1.5, -2.5 }; uint8_t n4v5[] = { 0, 255 }; int32_t n4v6[] = { -2147483647 - 1, 10, -1, 0, 1, 2147483647 }; uint16_t n4v7[] = { 65535, 32768, 1, 0 }; int32_t ival = -1234; uint32_t uval = 1234; float f1 = 4.5678; float f2 = 9.8765; size_t nvals, i; if (sam_read1(in, header, aln) >= 0) { if ((p = check_bam_aux_get(aln, "XA", 'A')) && bam_aux2A(p) != 'k') fail("XA field is '%c', expected 'k'", bam_aux2A(p)); bam_aux_del(aln,p); if (bam_aux_get(aln,"XA")) fail("XA field was not deleted"); if ((p = check_bam_aux_get(aln, "Xi", 'C')) && bam_aux2i(p) != 37) fail("Xi field is %"PRId64", expected 37", bam_aux2i(p)); if ((p = check_bam_aux_get(aln, "Xf", 'f')) && fabs(bam_aux2f(p) - PI) > 1E-6) fail("Xf field is %.12f, expected pi", bam_aux2f(p)); if ((p = check_bam_aux_get(aln, "Xd", 'd')) && fabs(bam_aux2f(p) - E) > 1E-6) fail("Xf field is %.12f, expected e", bam_aux2f(p)); if ((p = check_bam_aux_get(aln, "XZ", 'Z')) && strcmp(bam_aux2Z(p), HELLO) != 0) fail("XZ field is \"%s\", expected \"%s\"", bam_aux2Z(p), HELLO); bam_aux_update_str(aln,"XZ",strlen(NEW_HELLO)+1,NEW_HELLO); if ((p = check_bam_aux_get(aln, "XZ", 'Z')) && strcmp(bam_aux2Z(p), NEW_HELLO) != 0) fail("XZ field is \"%s\", expected \"%s\"", bam_aux2Z(p), NEW_HELLO); if ((p = check_bam_aux_get(aln, "XH", 'H')) && strcmp(bam_aux2Z(p), BEEF) != 0) fail("XH field is \"%s\", expected \"%s\"", bam_aux2Z(p), BEEF); if ((p = check_bam_aux_get(aln, "XB", 'B')) && ! 
(memcmp(p, "Bc", 2) == 0 && memcmp(p + 2, "\x03\x00\x00\x00\xfe\x00\x02", 7) == 0)) fail("XB field is %c,..., expected c,-2,0,+2", p[1]); check_int_B_array(aln, "B0", NELE(b0vals), b0vals); check_int_B_array(aln, "B1", NELE(b1vals), b1vals); check_int_B_array(aln, "B2", NELE(b2vals), b2vals); check_int_B_array(aln, "B3", NELE(b3vals), b3vals); check_int_B_array(aln, "B4", NELE(b4vals), b4vals); check_int_B_array(aln, "B5", NELE(b5vals), b5vals); nvals = NELE(bfvals); if ((p = check_bam_aux_get(aln, "Bf", 'B')) != NULL) { if (bam_auxB_len(p) != nvals) fail("Wrong length reported for Bf field, got %d, expected %zd\n", bam_auxB_len(p), nvals); for (i = 0; i < nvals; i++) { if (bam_auxB2f(p, i) != bfvals[i]) { fail("Wrong value from bam_auxB2f for Bf field index %zd, " "got %f expected %f\n", i, bam_auxB2f(p, i), bfvals[i]); } } } if ((p = check_bam_aux_get(aln, "ZZ", 'I')) && bam_aux2i(p) != 1000000) fail("ZZ field is %"PRId64", expected 1000000", bam_aux2i(p)); if ((p = bam_aux_get(aln, "Y1")) && bam_aux2i(p) != -2147483647-1) fail("Y1 field is %"PRId64", expected -2^31", bam_aux2i(p)); if ((p = bam_aux_get(aln, "Y2")) && bam_aux2i(p) != -2147483647) fail("Y2 field is %"PRId64", expected -2^31+1", bam_aux2i(p)); if ((p = bam_aux_get(aln, "Y3")) && bam_aux2i(p) != -1) fail("Y3 field is %"PRId64", expected -1", bam_aux2i(p)); if ((p = bam_aux_get(aln, "Y4")) && bam_aux2i(p) != 0) fail("Y4 field is %"PRId64", expected 0", bam_aux2i(p)); if ((p = bam_aux_get(aln, "Y5")) && bam_aux2i(p) != 1) fail("Y5 field is %"PRId64", expected 1", bam_aux2i(p)); if ((p = bam_aux_get(aln, "Y6")) && bam_aux2i(p) != 2147483647) fail("Y6 field is %"PRId64", expected 2^31-1", bam_aux2i(p)); if ((p = bam_aux_get(aln, "Y7")) && bam_aux2i(p) != 2147483648LL) fail("Y7 field is %"PRId64", expected 2^31", bam_aux2i(p)); if ((p = bam_aux_get(aln, "Y8")) && bam_aux2i(p) != 4294967295LL) fail("Y8 field is %"PRId64", expected 2^32-1", bam_aux2i(p)); // Try appending some new tags if 
(bam_aux_append(aln, "N0", 'i', sizeof(ival), (uint8_t *) &ival) != 0) fail("Failed to append N0:i tag"); if ((p = bam_aux_get(aln, "N0")) && bam_aux2i(p) != ival) fail("N0 field is %"PRId64", expected %d", bam_aux2i(p), ival); if (bam_aux_append(aln, "N1", 'I', sizeof(uval), (uint8_t *) &uval) != 0) fail("failed to append N1:I tag"); if ((p = bam_aux_get(aln, "N1")) && bam_aux2i(p) != uval) fail("N1 field is %"PRId64", expected %u", bam_aux2i(p), uval); // Append tags with bam_aux_update_int() if (bam_aux_update_int(aln, "N2", -2) < 0) fail("failed to append N2:c tag"); if (bam_aux_update_int(aln, "N3", 3) < 0) fail("failed to append N3:C tag"); p = bam_aux_get(aln, "N2"); if (!p) fail("failed to retrieve N2 tag"); else if (*p != 'c' || bam_aux2i(p) != -2) fail("N2 field is %c:%"PRId64", expected c:-2", *p, bam_aux2i(p)); p = bam_aux_get(aln, "N3"); if (!p) fail("failed to retrieve N3 tag"); else if (*p != 'C' || bam_aux2i(p) != 3) fail("N3 field is %c:%"PRId64", expected C:3", *p, bam_aux2i(p)); // Try changing values with bam_aux_update_int() i = test_update_int(aln, "N2", 2, 'C', "N3", 3, 'C'); if (i == 0) test_update_int(aln, "N2", 1234, 'S', "N3", 3, 'C'); if (i == 0) test_update_int(aln, "N2", -1, 's', "N3", 3, 'C'); if (i == 0) test_update_int(aln, "N2", 4294967295U, 'I', "N3", 3, 'C'); if (i == 0) test_update_int(aln, "N2", -2, 'i', "N3", 3, 'C'); // Append a value with bam_aux_update_float() if (bam_aux_update_float(aln, "F1", f1) < 0) fail("append F1:f tag"); p = bam_aux_get(aln, "F1"); if (!p) fail("retrieve F1 tag"); else if (*p != 'f' || bam_aux2f(p) != f1) fail("F1 field is %c:%e, expected f:%e", *p, bam_aux2f(p), f1); // Change a double tag to a float if (bam_aux_update_float(aln, "F2", f2) < 0) fail("update F2 tag"); p = bam_aux_get(aln, "F2"); if (!p) fail("retrieve F2 tag"); else if (*p != 'f' || bam_aux2f(p) != f2) fail("F2 field is %c:%e, expected f:%e", *p, bam_aux2f(p), f2); // Check the next one is intact too p = bam_aux_get(aln, "Y1"); if 
(!p) fail("retrieve Y1 tag"); else if (*p != 'i' && bam_aux2i(p) != -2147483647-1) fail("Y1 field is %"PRId64", expected -2^31", bam_aux2i(p)); // bam_aux_update_array tests // append a new array i = test_update_array(aln, "N4", 'c', NELE(n4v1), n4v1, "\0\0", 0, 0); // Add a sentinal to check resizes work if (i == 0) i = test_update_int(aln, "N5", 4242, 'S', "\0\0", 0, 0); // alter the array tag a few times if (i == 0) i = test_update_array(aln, "N4", 'I', NELE(n4v2), n4v2, "N5", 4242, 'S'); if (i == 0) i = test_update_array(aln, "N4", 's', NELE(n4v3), n4v3, "N5", 4242, 'S'); if (i == 0) i = test_update_array(aln, "N4", 'f', NELE(n4v4), n4v4, "N5", 4242, 'S'); if (i == 0) i = test_update_array(aln, "N4", 'c', NELE(n4v5), n4v5, "N5", 4242, 'S'); if (i == 0) i = test_update_array(aln, "N4", 'i', NELE(n4v6), n4v6, "N5", 4242, 'S'); if (i == 0) i = test_update_array(aln, "N4", 'S', NELE(n4v7), n4v7, "N5", 4242, 'S'); if (sam_format1(header, aln, &ks) < 0) fail("can't format record"); if (strcmp(ks.s, r1) != 0) fail("record formatted incorrectly: \"%s\"", ks.s); free(ks.s); } else fail("can't read record"); bam_destroy1(aln); bam_hdr_destroy(header); sam_close(in); return 1; }
// from bam_md.c in SAMtools // modified not fill in the NM tag, and not to start the reference a c->pos static void tmap_sam_md1_core(bam1_t *b, char *ref) { uint8_t *seq = bam1_seq(b); uint32_t *cigar = bam1_cigar(b); bam1_core_t *c = &b->core; int i, x, y, u = 0; kstring_t *str; uint8_t *old_md, *old_nm; int32_t old_nm_i=-1, nm=0; str = (kstring_t*)calloc(1, sizeof(kstring_t)); for (i = y = x = 0; i < c->n_cigar; ++i) { int j, l = cigar[i]>>4, op = cigar[i]&0xf; if (op == BAM_CMATCH) { for (j = 0; j < l; ++j) { int z = y + j; int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]]; if (ref[x+j] == 0) break; // out of boundary if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match ++u; } else { ksprintf(str, "%d", u); kputc(ref[x+j], str); u = 0; nm++; } } if (j < l) break; x += l; y += l; } else if (op == BAM_CDEL) { ksprintf(str, "%d", u); kputc('^', str); for (j = 0; j < l; ++j) { if (ref[x+j] == 0) break; kputc(ref[x+j], str); } u = 0; if (j < l) break; x += l; nm += l; } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) { y += l; if (op == BAM_CINS) nm += l; } else if (op == BAM_CREF_SKIP) { x += l; } } ksprintf(str, "%d", u); // update MD old_md = bam_aux_get(b, "MD"); if(NULL == old_md) { bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); } else { int is_diff = 0; if(strlen((char*)old_md+1) == str->l) { for(i = 0; i < str->l; ++i) { if(toupper(old_md[i+1]) != toupper(str->s[i])) { break; } } if(i < str->l) { is_diff = 1; } } else { is_diff = 1; } if(1 == is_diff) { bam_aux_del(b, old_md); bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); } } // update NM old_nm = bam_aux_get(b, "NM"); if(NULL != old_nm) { old_nm_i = bam_aux2i(old_nm); if(old_nm_i != nm) { bam_aux_del(b, old_nm); bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); } } free(str->s); free(str); }
int main(int argc, char *argv[]) { hashtable ht=new_hashtable(HASHSIZE); bamFile in,in2; bamFile out; if (argc != 3) { fprintf(stderr, "Usage: bam_fix_NH <in.bam> <out.bam>\n"); return 1; } // Open file and exit if error //in = strcmp(argv[1], "-")? bam_open(argv[1], "rb") : bam_dopen(fileno(stdin), "rb"); in = bam_open(argv[1], "rb"); out = strcmp(argv[2], "-")? bam_open(argv[2], "w") : bam_dopen(fileno(stdout), "w"); if (in == 0 ) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]); return 1; } if (out == 0) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[2]); return 1; } unsigned long num_alns=0; int ref; // *********** // Copy header bam_header_t *header; header = bam_header_read(in); bam_header_write(out,header); // sorted by name? // Should not rely on the value in SO bam1_t *aln=bam_init1(); bam1_t *prev=bam_init1(); printf("Hashing...\n");flush(stdout); while(bam_read1(in,aln)>=0) { // read alignment if (aln->core.tid < 0) continue;//ignore unaligned reads ++num_alns; new_read_aln(ht,bam1_qname(aln)); } bam_close(in); printf("Hashing complete (%lu alignments)\n",num_alns); printf("Memory used in the hash: %ld MB\n",index_mem/1024/1024); flush(stdout); // reopen in2 = bam_open(argv[1], "rb"); if (in2 == 0 ) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]); return 1; } header = bam_header_read(in2); while(bam_read1(in2,aln)>=0) { // read alignment if (aln->core.tid < 0) continue;//ignore unaligned reads ++num_alns; READ_ALN *r=get_read_aln(ht,bam1_qname(aln)); //assert(r!=NULL); // update the NH field uint8_t *old_nh = bam_aux_get(aln, "NH"); uint8_t nh=r->ctr; if (old_nh) { if (nh!=bam_aux2i(old_nh)) { fprintf(stderr,"warning: value mismatch! 
replacing>%s %d->%d\n",bam1_qname(aln),bam_aux2i(old_nh),nh); } bam_aux_del(aln, old_nh); bam_aux_append(aln, "NH", 'i', 4, (uint8_t*)&nh); } if (!old_nh) { // add NH bam_aux_append(aln, "NH", 'i', 4, (uint8_t*)&nh); #ifdef DEBUG printf("!>%s %d\n",bam1_qname(aln),bam_aux2i(old_nh)); #endif } // in->header // Also fix the XS:A tag // BAM_FREAD1 // BAM_FREAD2 // BAM_FREVERSE the read is mapped to the reverse strand //bam1_cigar(b) //BAM_CREF_SKIP 3 CIGAR skip on the reference (e.g. spliced alignment) //BAM_FREVERSE 16 the read is mapped to the reverse strand if (aln->core.flag & BAM_FSECONDARY) continue; // skip secondary alignments if (aln->core.flag & ! BAM_FPAIRED) continue; // not paired if (aln->core.flag & ! BAM_FPROPER_PAIR) continue; // not a proper pair if (aln->core.flag & ! BAM_FMUNMAP) continue; // the mate is mapped if (aln->core.flag & BAM_FSECONDARY) continue; // secundary read if (aln->core.flag & BAM_FREAD2) continue; // only count each pair once // core.strand == 0 (f/+) 1 r/- // flag // bam1_qname(b) bam_write1(out,aln); } // bam_destroy1(aln); bam_close(in2); bam_close(out); return 0; /* uint8_t *old_nm = bam_aux_get(b, "NM"); 90 if (c->flag & BAM_FUNMAP) return; 91 if (old_nm) old_nm_i = bam_aux2i(old_nm); 92 if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); 93 else if (nm != old_nm_i) { 94 fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm); 95 bam_aux_del(b, old_nm); 96 bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); 97 } */ }
// ReadAlign::chimericDetection
// Tries to combine the best transcript (trBest) with a transcript from another alignment
// window into a two-segment chimeric alignment (trChim[0], trChim[1]) and writes it out.
// Returns true when a chimeric alignment was recorded, false otherwise.
//
// Steps visible in the body:
//  1. Only when OUTPUT_localChains is defined: dump every window transcript to P->inOut->outLocalChains.
//  2. Continue only if P->chimSegmentMin>0, nW>1, trBest->rLength>=chimSegmentMin, at least
//     chimSegmentMin read bases are left uncovered at one end of trBest, trBest beats nextTrScore by
//     more than P->outFilterMultimapScoreRange, and its intronMotifs[0]==0 with at most one of
//     intronMotifs[1]/[2] non-zero.
//  3. Scan the windows (only the top transcript of every window other than trBest's own), skip
//     candidates with intronMotifs[0]>0 or an inconsistent strand, and keep the candidate with the
//     highest chimScore = sum of both maxScore values minus the read overlap. The second-best score
//     is tracked in chimScoreNext.
//  4. Accept only if chimScoreNext + P->chimScoreSeparation < chimScoreBest. If the junction lies
//     between mates, chimJ0/chimJ1 come straight from the exon ends (chimMotif=-1). If it lies within
//     one mate, scan read positions for the best-scoring junction (GT/AG or CT/AC motif preferred via
//     P->chimScoreJunctionNonGTAG), reject on any base >3, shift the exon boundaries, and count up to
//     100 repeated bases on each side (chimRepeat0/chimRepeat1).
//  5. If the overhang / strand / chromosome / distance conditions hold, set chimRecord=true, re-score
//     both segments, assign primaryFlag, then write: BAM records with an appended "SA" Z tag when
//     P->chimOutType=="WithinBAM"; SAM records through outputTranscriptSAM; and one tab-separated
//     line to chunkOutChimJunction.
//
// Writes the members chimStr, chimN, chimRepeat and trChim[].
// NOTE(review): in the SA-tag loop the bam1_t obtained from bam_init1() is not released inside this
// function, and the pointer returned by bam_aux_get(b,"NM") is passed on without a NULL check -- confirm
// ownership rules of bam_read1_fromArray and whether NM is always present.
bool ReadAlign::chimericDetection() { bool chimRecord=false; //output chains for out-of-STAR chimeric detection #ifdef OUTPUT_localChains { P->inOut->outLocalChains << readName <<"\t"<< Read0[0] <<"\t"<< Read0[1] << "\n"; for (uint iw=0; iw<nW; iw++) { for (uint itr=0;itr<nWinTr[iw];itr++) { P->inOut->outLocalChains << trAll[iw][itr]->maxScore<<"\t"<< trAll[iw][itr]->Chr<<"\t"<<trAll[iw][itr]->Str<<"\t"<<trAll[iw][itr]->nExons; for (uint ib=0;ib<trAll[iw][itr]->nExons;ib++) { P->inOut->outLocalChains <<"\t"<< trAll[iw][itr]->exons[ib][EX_G]-P->chrStart[trAll[iw][itr]->Chr] \ <<"\t"<< trAll[iw][itr]->exons[ib][EX_R] <<"\t"<< trAll[iw][itr]->exons[ib][EX_L]; }; P->inOut->outLocalChains <<"\n"; }; }; }; #endif //////////////////// chimeras //stich windows => chimeras //stich only the best window with one of the lower score ones for now - do not stich 2 lower score windows //stitch only one window on each end of the read if (P->chimSegmentMin>0 && nW>1 && trBest->rLength >= P->chimSegmentMin \ && ( trBest->exons[trBest->nExons-1][EX_R] + trBest->exons[trBest->nExons-1][EX_L] + P->chimSegmentMin <= Lread \ || trBest->exons[0][EX_R] >= P->chimSegmentMin ) \ && trBest->nextTrScore+P->outFilterMultimapScoreRange < trBest->maxScore \ && trBest->intronMotifs[0]==0 && (trBest->intronMotifs[1]==0 || trBest->intronMotifs[2]==0) ) { //there is unmapped space at the start/end, and the main window is not a multimapping window, and non non-canonical junctions, and consistend junction motif int chimScoreBest=0,chimScoreNext=0; trChim[0]=*trBest; uint roStart1=trBest->Str==0 ? trBest->exons[0][EX_R] : Lread - trBest->exons[trBest->nExons-1][EX_R] - trBest->exons[trBest->nExons-1][EX_L]; uint roEnd1=trBest->Str==0 ?
trBest->exons[trBest->nExons-1][EX_R] + trBest->exons[trBest->nExons-1][EX_L] - 1 : Lread - trBest->exons[0][EX_R] - 1; if (roStart1>readLength[0]) roStart1--; if (roEnd1>readLength[0]) roEnd1--; uint chimStrBest=0; if (trBest->intronMotifs[1]==0 && trBest->intronMotifs[2]==0) {//strand is undefined chimStr=0; } else if ( (trBest->Str==0) == (trBest->intronMotifs[1]>0)) {//strand the same as RNA chimStr=1; } else {//strand opposite to RNA chimStr=2; }; for (uint iW=0; iW<nW; iW++) {//check all other windows for chimeras for (uint iWt=0; iWt<nWinTr[iW]; iWt++){//cycl over transcripts in the window if (trBest!=trAll[iW][0] && iWt>0) break; //for all windows except that of the best transcript - hceck only iWt=0 (best trnascripts) if (trBest==trAll[iW][0] && iWt==0) continue; // {//same window // if (iWt==0) continue; //do not check the best transcript itself // if (trBest->exons[0][EX_R]<=trAll[iW][iWt]->exons[0][EX_R]) { // //start of the last Best exon is before end of the first Chim exon // if (trBest->exons[trBest->nExons-1][EX_G]<trAll[iW][iWt]->exons[0][EX_G]+trAll[iW][iWt]->exons[0][EX_L]) continue; // } else { // if (trAll[iW][iWt]->exons[trAll[iW][iWt]->nExons-1][EX_G]<trBest->exons[0][EX_G]+trBest->exons[0][EX_L]) continue; // }; // }; if (trAll[iW][iWt]->intronMotifs[0]>0) continue; //do not stitch a window to itself, or to a window with non-canonical junctions uint chimStr1; if (trAll[iW][iWt]->intronMotifs[1]==0 && trAll[iW][iWt]->intronMotifs[2]==0) {//strand is undefined chimStr1=0; } else if ( (trAll[iW][iWt]->Str==0) == (trAll[iW][iWt]->intronMotifs[1]>0)) {//strand the same as RNA chimStr1=1; } else {//strand opposite to RNA chimStr1=2; }; if (chimStr!=0 && chimStr1!=0 && chimStr!=chimStr1) continue; //chimeric segments have to have consitent strands uint roStart2=trAll[iW][iWt]->Str==0 ?
trAll[iW][iWt]->exons[0][EX_R] : Lread - trAll[iW][iWt]->exons[trAll[iW][iWt]->nExons-1][EX_R] - trAll[iW][iWt]->exons[trAll[iW][iWt]->nExons-1][EX_L]; uint roEnd2=trAll[iW][iWt]->Str==0 ? trAll[iW][iWt]->exons[trAll[iW][iWt]->nExons-1][EX_R] + trAll[iW][iWt]->exons[trAll[iW][iWt]->nExons-1][EX_L] - 1 : Lread - trAll[iW][iWt]->exons[0][EX_R] - 1; if (roStart2>readLength[0]) roStart2--; if (roEnd2>readLength[0]) roEnd2--; uint chimOverlap = roStart2>roStart1 ? (roStart2>roEnd1 ? 0 : roEnd1-roStart2+1) : (roEnd2<roStart1 ? 0 : roEnd2-roStart1+1); bool diffMates=(roEnd1 < readLength[0] && roStart2 >= readLength[0]) || (roEnd2 < readLength[0] && roStart1 >= readLength[0]); //segment lengths && (different mates || small gap between segments) if (roEnd1 > P->chimSegmentMin + roStart1 + chimOverlap && roEnd2> P->chimSegmentMin + roStart2 + chimOverlap \ && ( diffMates || ( (roEnd1 + P->maxChimReadGap + 1) >= roStart2 && (roEnd2 + P->maxChimReadGap + 1) >= roStart1 ) ) ) { //maxChimReadGap=0 in Parameters.cpp int chimScore=trBest->maxScore + trAll[iW][iWt]->maxScore - (int)chimOverlap; //subtract overlap to avoid double counting if (chimScore > chimScoreBest && chimScore >= P->chimScoreMin && chimScore+P->chimScoreDropMax >= (int) (readLength[0]+readLength[1]) ) { trChim[1]=*trAll[iW][iWt]; chimScoreNext=chimScoreBest; chimScoreBest=chimScore; trChim[1].roStart = trChim[1].roStr ==0 ?
trChim[1].rStart : Lread - trChim[1].rStart - trChim[1].rLength; trChim[1].cStart = trChim[1].gStart - P->chrStart[trChim[1].Chr]; chimStrBest=chimStr1; } else if (chimScore>chimScoreNext) {//replace the nextscore if it's not the best one and is higher than the previous one chimScoreNext=chimScore; }; }; };//cycle over window transcripts };//cyecl over windows if (chimStr==0) chimStr=chimStrBest; chimN=0; if (chimScoreNext + P->chimScoreSeparation < chimScoreBest) {//report only if chimera is unique if (trChim[0].roStart > trChim[1].roStart) swap (trChim[0],trChim[1]); uint e0 = trChim[0].Str==1 ? 0 : trChim[0].nExons-1; uint e1 = trChim[1].Str==0 ? 0 : trChim[1].nExons-1; uint chimRepeat0=0,chimRepeat1=0,chimJ0=0,chimJ1=0; int chimMotif=0; chimN=2; if ( trChim[0].exons[e0][EX_iFrag] > trChim[1].exons[e1][EX_iFrag] ) {//strange configuration, rare, similar to the next one chimN=0;//reject such chimeras //good test example: //CTTAGCTAGCAGCGTCTTCCCAGTGCCTGGAGGGCCAGTGAGAATGGCACCCTCTGGGATTTTTGCTCCTAGGTCT //TTGAGGTGAAGTTCAAAGATGTGGCTGGCTGTGAGGAGGCCGAGCTAGAGATCATGGAATTTGTGAATTTCTTGAA } else if ( trChim[0].exons[e0][EX_iFrag] < trChim[1].exons[e1][EX_iFrag] ) {//mates bracket the chimeric junction chimN=2; chimRepeat=0; chimMotif=-1; if (trChim[0].Str==1) {//negative strand chimJ0=trChim[0].exons[e0][EX_G]-1; } else { chimJ0=trChim[0].exons[e0][EX_G]+trChim[0].exons[e0][EX_L]; }; if (trChim[1].Str==0) {//positive strand chimJ1=trChim[1].exons[e1][EX_G]-1; } else { chimJ1=trChim[1].exons[e1][EX_G]+trChim[1].exons[e1][EX_L]; }; } else {//chimeric junctions is within one of the mates, check and shift chimeric junction if necessary if (trChim[0].exons[e0][EX_L]>=P->chimJunctionOverhangMin && trChim[1].exons[e1][EX_L]>=P->chimJunctionOverhangMin ) {//large enough overhang required uint roStart0 = trChim[0].Str==0 ? trChim[0].exons[e0][EX_R] : Lread - trChim[0].exons[e0][EX_R] - trChim[0].exons[e0][EX_L]; uint roStart1 = trChim[1].Str==0 ?
trChim[1].exons[e1][EX_R] : Lread - trChim[1].exons[e1][EX_R] - trChim[1].exons[e1][EX_L]; uint jR, jRbest=0; int jScore=0,jMotif=0,jScoreBest=-999999,jScoreJ=0; for (jR=0; jR<roStart1+trChim[1].exons[e1][EX_L]-roStart0-1; jR++) {//scan through the exons to find a canonical junction, and check for mismatches if (jR==readLength[0]) jR++; //skip the inter-mate base char bR=Read1[0][roStart0+jR]; char b0,b1; if (trChim[0].Str==0) { b0=G[trChim[0].exons[e0][EX_G]+jR]; } else { b0=G[trChim[0].exons[e0][EX_G]+trChim[0].exons[e0][EX_L]-1-jR]; if (b0<4) b0=3-b0; }; if (trChim[1].Str==0) { b1=G[trChim[1].exons[e1][EX_G]-roStart1+roStart0+jR]; } else { b1=G[trChim[1].exons[e1][EX_G]+trChim[1].exons[e1][EX_L]-1+roStart1-roStart0-jR]; if (b1<4) b1=3-b1; }; if (b0>3 || b1>3 || bR>3) {//chimera is not called if there are Ns in the genome or in the read chimN=0; break; }; char b01,b02,b11,b12; if (trChim[0].Str==0) { b01=G[trChim[0].exons[e0][EX_G]+jR+1]; b02=G[trChim[0].exons[e0][EX_G]+jR+2]; } else { b01=G[trChim[0].exons[e0][EX_G]+trChim[0].exons[e0][EX_L]-1-jR-1]; if (b01<4) b01=3-b01; b02=G[trChim[0].exons[e0][EX_G]+trChim[0].exons[e0][EX_L]-1-jR-2]; if (b02<4) b02=3-b02; }; if (trChim[1].Str==0) { b11=G[trChim[1].exons[e1][EX_G]-roStart1+roStart0+jR-1]; b12=G[trChim[1].exons[e1][EX_G]-roStart1+roStart0+jR]; } else { b11=G[trChim[1].exons[e1][EX_G]+trChim[1].exons[e1][EX_L]-1+roStart1-roStart0-jR+1]; if (b11<4) b11=3-b11; b12=G[trChim[1].exons[e1][EX_G]+trChim[1].exons[e1][EX_L]-1+roStart1-roStart0-jR]; if (b12<4) b12=3-b12; }; jMotif=0; if (b01==2 && b02==3 && b11==0 && b12==2) {//GTAG if (chimStr!=2) { jMotif=1; }; } else if(b01==1 && b02==3 && b11==0 && b12==1) {//CTAC if (chimStr!=1) { jMotif=2; }; }; if (bR==b0 && bR!=b1) { jScore++; } else if (bR!=b0 && bR==b1) { jScore--; }; jScoreJ =jMotif==0 ?
jScore + P->chimScoreJunctionNonGTAG : jScore ; if ( jScoreJ > jScoreBest || (jScoreJ == jScoreBest && jMotif>0) ) { chimMotif=jMotif; jRbest=jR; jScoreBest=jScoreJ; }; };//jR cycle if (chimN>0) {//else the chimera was rejected because of mismatches //shift junction in trChim if (trChim[0].Str==1) { trChim[0].exons[e0][EX_R] +=trChim[0].exons[e0][EX_L]-jRbest-1; trChim[0].exons[e0][EX_G] +=trChim[0].exons[e0][EX_L]-jRbest-1; trChim[0].exons[e0][EX_L]=jRbest+1; chimJ0=trChim[0].exons[e0][EX_G]-1; } else { trChim[0].exons[e0][EX_L]=jRbest+1; chimJ0=trChim[0].exons[e0][EX_G]+trChim[0].exons[e0][EX_L]; }; if (trChim[1].Str==0) { trChim[1].exons[e1][EX_R] +=roStart0+jRbest+1-roStart1; trChim[1].exons[e1][EX_G] +=roStart0+jRbest+1-roStart1; trChim[1].exons[e1][EX_L]=roStart1+trChim[1].exons[e1][EX_L]-roStart0-jRbest-1; chimJ1=trChim[1].exons[e1][EX_G]-1; } else { trChim[1].exons[e1][EX_L]=roStart1+trChim[1].exons[e1][EX_L]-roStart0-jRbest-1; chimJ1=trChim[1].exons[e1][EX_G]+trChim[1].exons[e1][EX_L]; }; //find repeats char b0,b1; uint jR; for (jR=0;jR<100;jR++) {//forward check if (trChim[0].Str==0) { b0=G[chimJ0+jR]; } else { b0=G[chimJ0-jR]; if (b0<4) b0=3-b0; }; if (trChim[1].Str==0) { b1=G[chimJ1+1+jR]; } else { b1=G[chimJ1-1-jR]; if (b1<4) b1=3-b1; }; if (b0!=b1) break; }; chimRepeat1=jR; for (jR=0;jR<100;jR++) {//reverse check if (trChim[0].Str==0) { b0=G[chimJ0-1-jR]; } else { b0=G[chimJ0+1+jR]; if (b0<4) b0=3-b0; }; if (trChim[1].Str==0) { b1=G[chimJ1-jR]; } else { b1=G[chimJ1+jR]; if (b1<4) b1=3-b1; }; if (b0!=b1) break; }; chimRepeat0=jR; };//chimN>0 };//large enough overhang };//chimeric junction is within a mate //debug // cout << readName <<"\t"<< (trChim[0].Str==0 ? chimJ1-chimJ0 : chimJ0-chimJ1) << "\t"<< (chimMotif>=0 ?
P->alignIntronMax : P->alignMatesGapMax)<<"\n"; // cout << chimRepeat0 <<"\t"<<trChim[0].exons[e0][EX_L]<<"\n"; //chimeric alignments output if ( chimN==2 && trChim[0].exons[e0][EX_L]>=P->chimJunctionOverhangMin+chimRepeat0 \ && trChim[1].exons[e1][EX_L]>=P->chimJunctionOverhangMin+chimRepeat1 \ && ( trChim[0].Str!=trChim[1].Str || trChim[0].Chr!=trChim[1].Chr \ || (trChim[0].Str==0 ? chimJ1-chimJ0+1LLU : chimJ0-chimJ1+1LLU) > (chimMotif>=0 ? P->alignIntronMax : P->alignMatesGapMax) ) ) {//unique chimeras only && minOverhang1 //&& minOverhang2 //&& (diff str || diff chr || //|| gap > (alignIntronMax,alignMatesGapMax) ) negative gap = very large # because of uint chimRecord=true; //chimeric alignment was recorded //re-calculate the score for chimeric transcripts trChim[0].alignScore(Read1, G, P); trChim[1].alignScore(Read1, G, P); int chimRepresent=-999, chimType=0; if (trChim[0].exons[0][EX_iFrag]!=trChim[0].exons[trChim[0].nExons-1][EX_iFrag]) {//tr0 has both mates chimRepresent = 0; chimType = 1; trChim[0].primaryFlag=true;//paired portion is primary trChim[1].primaryFlag=false; } else if (trChim[1].exons[0][EX_iFrag]!=trChim[1].exons[trChim[1].nExons-1][EX_iFrag]) {//tr1 has both mates chimRepresent = 1; chimType = 1; trChim[1].primaryFlag=true;//paired portion is primary trChim[0].primaryFlag=false; } else if (trChim[0].exons[0][EX_iFrag]!=trChim[1].exons[0][EX_iFrag]) {//tr0 and tr1 are single different mates chimRepresent = -1; chimType = 2; trChim[0].primaryFlag=true; trChim[1].primaryFlag=true; } else {//two chimeric segments are on the same mate - this can only happen for single-end reads chimRepresent = (trChim[0].maxScore > trChim[1].maxScore) ?
0 : 1; chimType = 3; trChim[chimRepresent].primaryFlag=true; trChim[1-chimRepresent].primaryFlag=false; }; if (P->chimOutType=="WithinBAM") {//BAM output int alignType, bamN=0, bamIsuppl=-1, bamIrepr=-1; uint bamBytesTotal=0;//estimate of the total size of all bam records, for output buffering uint mateChr,mateStart; uint8_t mateStrand; for (int itr=0;itr<(int)chimN;itr++) {//generate bam for all chimeric pieces if (chimType==2) {//PE, encompassing mateChr=trChim[1-itr].Chr; mateStart=trChim[1-itr].exons[0][EX_G]; mateStrand=(uint8_t) (trChim[1-itr].Str!=trChim[1-itr].exons[0][EX_iFrag]); alignType=-1; } else {//spanning chimeric alignment, could be PE or SE mateChr=-1;mateStart=-1;mateStrand=0;//no need fot mate info unless this is the supplementary alignment if (chimRepresent==itr) { alignType=-1; //this is representative part of chimeric alignment, record is as normal; if encompassing chimeric junction, both are recorded as normal bamIrepr=( (itr%2)==(trChim[itr].Str) ) ? bamN+1 : bamN;//this is the mate that is chimerically split } else {//"supplementary" chimeric segment alignType=( (itr%2)==(trChim[itr].Str) ) ?
-12 : -11; //right:left chimeric junction bamIsuppl=bamN; if (chimType==1) {//PE alignment, need mate info for the suppl uint iex=0; for (;iex<trChim[chimRepresent].nExons-1;iex++) { if (trChim[chimRepresent].exons[iex][EX_iFrag]!=trChim[itr].exons[0][EX_iFrag]) { break; }; }; mateChr=trChim[chimRepresent].Chr; mateStart=trChim[chimRepresent].exons[iex][EX_G]; mateStrand=(uint8_t) (trChim[chimRepresent].Str!=trChim[chimRepresent].exons[iex][EX_iFrag]); }; }; }; bamN+=alignBAM(trChim[itr], 1, 1, P->chrStart[trChim[itr].Chr], mateChr, mateStart, mateStrand, \ alignType, NULL, P->outSAMattrOrder, outBAMoneAlign+bamN, outBAMoneAlignNbytes+bamN); bamBytesTotal+=outBAMoneAlignNbytes[0]+outBAMoneAlignNbytes[1];//outBAMoneAlignNbytes[1] = 0 if SE is recorded }; //write all bam lines for (int ii=0; ii<bamN; ii++) {//output all pieces int tagI=-1; if (ii==bamIrepr) { tagI=bamIsuppl; } else if (ii==bamIsuppl) { tagI=bamIrepr; }; if (tagI>=0) { bam1_t *b; b=bam_init1(); bam_read1_fromArray(outBAMoneAlign[tagI], b); uint8_t* auxp=bam_aux_get(b,"NM"); uint32_t auxv=bam_aux2i(auxp); string tagSA1="SAZ"+P->chrName[b->core.tid]+','+to_string((uint)b->core.pos+1) +',' + ( (b->core.flag&0x10)==0 ? '+':'-') + \ ',' + bam_cigarString(b) + ',' + to_string((uint)b->core.qual) + ',' + to_string((uint)auxv) + ';' ; memcpy( (void*) (outBAMoneAlign[ii]+outBAMoneAlignNbytes[ii]), tagSA1.c_str(), tagSA1.size()+1);//copy string including \0 at the end outBAMoneAlignNbytes[ii]+=tagSA1.size()+1; * ( (uint32*) outBAMoneAlign[ii] ) = outBAMoneAlignNbytes[ii]-sizeof(uint32); }; if (P->outBAMunsorted) outBAMunsorted->unsortedOneAlign(outBAMoneAlign[ii], outBAMoneAlignNbytes[ii], ii>0 ?
0 : bamBytesTotal); if (P->outBAMcoord) outBAMcoord->coordOneAlign(outBAMoneAlign[ii], outBAMoneAlignNbytes[ii], (iReadAll<<32) ); }; }; for (uint iTr=0;iTr<chimN;iTr++) {//write all chimeric pieces to Chimeric.out.sam/junction if (P->readNmates==2) { outputTranscriptSAM(trChim[iTr], chimN, iTr, trChim[1-iTr].Chr, trChim[1-iTr].exons[0][EX_G], (int) (trChim[1-iTr].Str!=trChim[1-iTr].exons[0][EX_iFrag]), -1, NULL, &chunkOutChimSAM); } else { outputTranscriptSAM(trChim[iTr], chimN, iTr, -1, -1, -1, -1, NULL, &chunkOutChimSAM); }; }; //junction + SAMp chunkOutChimJunction << P->chrName[trChim[0].Chr] <<"\t"<< chimJ0 - P->chrStart[trChim[0].Chr]+1 <<"\t"<< (trChim[0].Str==0 ? "+":"-") \ <<"\t"<< P->chrName[trChim[1].Chr] <<"\t"<< chimJ1 - P->chrStart[trChim[1].Chr]+1 <<"\t"<< (trChim[1].Str==0 ? "+":"-") \ <<"\t"<< chimMotif <<"\t"<< chimRepeat0 <<"\t"<< chimRepeat1 <<"\t"<< readName+1 \ <<"\t"<< trChim[0].exons[0][EX_G] - P->chrStart[trChim[0].Chr]+1 <<"\t"<< outputTranscriptCIGARp(trChim[0]) \ <<"\t"<< trChim[1].exons[0][EX_G] - P->chrStart[trChim[1].Chr]+1 <<"\t"<< outputTranscriptCIGARp(trChim[1]) <<"\n"; //<<"\t"<< trChim[0].exons[0][EX_iFrag]+1 --- no need for that, since trChim[0] is always on the first mate }; };//chimeric score };//chimeric search return chimRecord; };//END
static inline void famstats_fm_loop(famstats_t *s, bam1_t *b, famstats_fm_settings_t *settings) { uint8_t *data; if(b->core.flag & BAM_FREAD2) return; // Silently skip all read 2s since they have the same FM values. if((b->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY))) { ++s->n_flag_fail; return; } if(b->core.qual < settings->minmq) { ++s->n_mq_fail; return; } const int FM(((data = bam_aux_get(b, "FM")) != nullptr ? bam_aux2i(data) : 0)); const int NP(((data = bam_aux_get(b, "NP")) != nullptr ? bam_aux2i(data) : -1)); int RV(((data = bam_aux_get(b, "RV")) != nullptr ? bam_aux2i(data) : -1)); if(UNLIKELY(FM == 0)) LOG_EXIT("Missing required FM tag. Abort!\n"); if(FM < settings->minFM) { ++s->n_fm_fail; return; } if(bam_itag(b, "FP") == 0) { ++s->n_fp_fail; if(settings->skip_fp_fail) return; } ++s->n_pass; if(FM > 1) { ++s->realfm_counts; s->realfm_sum += FM; s->realrc_sum += RV < 0 ? 0 : RV; } ++s->allfm_counts; s->allfm_sum += FM; s->allrc_sum += RV < 0 ? 0 : RV; int khr; // Have we seen this family size before? if((s->ki = kh_get(fm, s->fm, FM)) == kh_end(s->fm)) // If not, put it into the hash table with a count of 1. s->ki = kh_put(fm, s->fm, FM, &khr), kh_val(s->fm, s->ki) = 1; else ++kh_val(s->fm, s->ki); // Otherwise increment counts // Same, but for RV if((s->ki = kh_get(fm, s->rc, RV)) == kh_end(s->rc)) s->ki = kh_put(fm, s->rc, RV, &khr), kh_val(s->rc, s->ki) = 1; else ++kh_val(s->rc, s->ki); // Same, but for NP if(NP > 0) { if((s->ki = kh_get(fm, s->np, NP)) == kh_end(s->np)) s->ki = kh_put(fm, s->np, NP, &khr), kh_val(s->np, s->ki) = 1; else ++kh_val(s->np, s->ki); } // If the Duplex Read tag is present, increment duplex read counts uint8_t *const dr_data(bam_aux_get(b, "DR")); if(dr_data && bam_aux2i(dr_data)) { if(RV < 0) RV = 0; s->dr_sum += FM; ++s->dr_counts; s->dr_rc_sum += RV; s->dr_rc_frac_sum += (double)RV / FM; } }
/*
 * Build the per-alignment statistics record for one BAM alignment.
 *
 * bam1 - alignment to summarise.
 * opts - options; when opts->region_table is set, only alignments for which
 *        find_region() succeeds are reported.
 *
 * Returns a new bam_stats_t from bam_stats_new(), or NULL when a region table
 * is given and the alignment's region is not found in it. For an unmapped
 * alignment the returned object only has `mapped` set to 0.
 */
bam_stats_t *bam1_stats(bam1_t *bam1, bam_stats_options_t *opts) {
  bam_stats_t *bam_stats = NULL;
  uint32_t bam_flag = (uint32_t) bam1->core.flag;

  if (bam_flag & BAM_FUNMAP) {
    // not mapped, then return
    bam_stats = bam_stats_new();
    bam_stats->mapped = 0;
    return bam_stats;
  }

  if (opts->region_table) {
    region_t region;
    region.chromosome = opts->sequence_labels[bam1->core.tid];
    region.start_position = bam1->core.pos;
    // the end is approximated by start + read length
    region.end_position = region.start_position + bam1->core.l_qseq;
    region.strand = NULL;
    region.type = NULL;

    if (!find_region(&region, opts->region_table)) {
      return NULL;
    }
  }
  bam_stats = bam_stats_new();

  // mapped !!
  bam_stats->mapped = 1;
  bam_stats->strand = (int) ((bam_flag & BAM_FREVERSE) > 0);

  // number of errors; a missing NM tag counts as 0
  uint8_t *nm_tag = bam_aux_get(bam1, "NM");
  bam_stats->num_errors = nm_tag ? bam_aux2i(nm_tag) : 0;

  // cigar handling: number of indels and length
  uint32_t cigar_int, *cigar = bam1_cigar(bam1);
  int num_cigar_ops = (int) bam1->core.n_cigar;
  for (int j = 0; j < num_cigar_ops; j++) {
    cigar_int = cigar[j];
    switch (cigar_int & BAM_CIGAR_MASK) {
    case BAM_CINS:  //I: insertion to the reference
    case BAM_CDEL:  //D: deletion from the reference
      bam_stats->num_indels++;
      bam_stats->indels_length += (cigar_int >> BAM_CIGAR_SHIFT);
      break;
    default:
      break;
    }
  }

  // quality
  bam_stats->quality = bam1->core.qual;

  // unique alignment
  if (!(bam_flag & BAM_FSECONDARY)) {
    bam_stats->unique_alignment = 1;
  }

  // handling pairs
  // BAM_FUNMAP is known to be clear here (unmapped records returned above), so
  // only the mapped_pair_* counters can apply.
  bam_stats->single_end = 1;
  if (bam_flag & BAM_FPAIRED) {
    bam_stats->single_end = 0;
    if (bam_flag & BAM_FREAD1) {
      bam_stats->mapped_pair_1 = 1;
    } else {
      bam_stats->mapped_pair_2 = 1;
    }
    if (!(bam_flag & BAM_FMUNMAP) && (bam_flag & BAM_FPROPER_PAIR)) {
      bam_stats->isize = abs(bam1->core.isize);
    }
  }

  // mapping length
  uint8_t *bam_seq = bam1_seq(bam1);  // packed 4-bit bases, decoded by bam1_seqi()
  int seq_len = bam1->core.l_qseq;
  bam_stats->seq_length = seq_len;

  // nucleotide content
  for (int i = 0; i < seq_len; i++) {
    switch (bam1_seqi(bam_seq, i)) {
    case 1:
      bam_stats->num_As++;
      break;
    case 2:
      bam_stats->num_Cs++;
      break;
    case 4:
      bam_stats->num_Gs++;
      break;
    case 8:
      bam_stats->num_Ts++;
      break;
    case 15:
      bam_stats->num_Ns++;
      break;
    default:
      // other ambiguity codes are not counted
      break;
    }
  }
  bam_stats->num_GCs = bam_stats->num_Gs + bam_stats->num_Cs;

  return bam_stats;
}
void profileReads(char* bamFile, int ignoreSuppAlignments, int ignoreSecondaryAlignments) { // int result = -1; int supp_check = 0x0; if (ignoreSuppAlignments) { supp_check |= BAM_FSUPPLEMENTARY; } if (ignoreSecondaryAlignments) { supp_check |= BAM_FSECONDARY; } // helper variables BGZF* in = 0 ; bam1_t *b = bam_init1(); bam_hdr_t *h; // open bam if ((in = bgzf_open(bamFile, "r")) == 0) { fprintf(stderr, "ERROR: Failed to open \"%s\" for reading.\n", bamFile); } else if ((h = bam_hdr_read(in)) == 0) { // read header fprintf(stderr, "ERROR: Failed to read BAM header of file \"%s\".\n", bamFile); } else { // destroy header bam_hdr_destroy(h); int line = 0; int supplementary, secondary; int mapQual; int matches, mismatches, qLen; float pcAln, pcId; int showStats = 0; uint8_t *aux_mismatches; // print header printf("line\tsupp\tsecondary\tmapQ\tmismatches\tmatches\tqLen\tpcId\tpcAln\n"); // fetch alignments while ((result = bam_read1(in, b)) >= 0) { line += 1; // only primary mappings if ((b->core.flag & supp_check) != 0) { if (showStats) fprintf(stdout, "Rejected %d, non-primary\n", line); continue; } supplementary = (b->core.flag & (1 | BAM_FSUPPLEMENTARY)) != 0; secondary = (b->core.flag & (1 | BAM_FSECONDARY)) != 0; // quality mapQual = b->core.qual; // bam_aux_get returns 0 if optional NM tag is missing if ((aux_mismatches = bam_aux_get(b, "NM"))) mismatches = bam_aux2i(aux_mismatches); else mismatches = 0; // length qLen = bam_cigar2qlen((&b->core)->n_cigar, bam_get_cigar(b)); // percent identity matches = bam_cigar2matches((&b->core)->n_cigar, bam_get_cigar(b)); pcId = (matches - mismatches) / (float)matches; // percentage as float between 0 to 1 // percent alignment pcAln = matches / (float)qLen; // percentage as float between 0 to 1 // print read values printf("%d\t%d\t%d\t%d\t%d\t%d\t%d\t%.4f\t%.4f\n", line, supplementary, secondary, mapQual, mismatches, matches, qLen, pcId, pcAln); } if (result < -1) { fprintf(stderr, "ERROR: retrieval of read no. 
%d from file \"%s\" failed with code %d.\n", line, bamFile, result); } } if (in) bgzf_close(in); bam_destroy1(b); }
static int fetch_func(const bam1_t *b, void *data) { fetch_data_t *d = (fetch_data_t*) data; char *name = bam1_qname(b); //check if name is requested here if(!d->requestedTranscripts->empty()) { //we're doing transcript filtering if(d->requestedTranscripts->find(name) == d->requestedTranscripts->end()) { //transcript wasn't requested return 0; } } fprintf(stderr,"%s\n",name); //TODO Desparately need some error checking on flag retrieval char *status = bam_aux2Z(bam_aux_get(b,"YT")); int length = bam_aux2i(bam_aux_get(b,"HI")); YTranscript* transcript; YTranscriptSubStructure structure; structure.position = b->core.pos + 1; structure.length = b->core.l_qseq; structure.ordinal = bam_aux2i(bam_aux_get(b,"HI")); if(d->transcriptNames.find(name,&transcript)) { //then we've already found some part of this transcript transcript->orderedStructures.push_back(structure); } else { transcript = new YTranscript; char *tidName = d->in->header->target_name[b->core.tid]; char *refName = new char[ strlen(tidName) + 1]; strcpy(refName, tidName); char *transcriptName = new char[ strlen(name) + 1]; strcpy(transcriptName, name); char *flagGeneName = bam_aux2Z(bam_aux_get(b,"YG")); char *geneName = new char[ strlen(flagGeneName) + 1]; strcpy(geneName, flagGeneName); char *statusName = new char[ strlen(status + 1) ]; strcpy(statusName, status); transcript->gene = geneName; transcript->name = transcriptName; transcript->refName = refName; transcript->status = statusName; transcript->orderedStructures.push_back(structure); transcript->strand = b->core.flag & BAM_FREVERSE ? -1 : 1; transcript->totalNumberOfStructures = bam_aux2i(bam_aux_get(b, "IH")); transcript->length = length; d->transcriptNames.insert(name,transcript); d->transcripts->push_back(transcript); } return 0; }
void filterReads(char * inBamFile, char * outBamFile, int minMapQual, int minLen, int maxMisMatches, float minPcId, float minPcAln, int ignoreSuppAlignments, int ignoreSecondaryAlignments) { // int result = -1; int outResult = -1; int supp_check = 0x0; if (ignoreSuppAlignments) { supp_check |= BAM_FSUPPLEMENTARY; } if (ignoreSecondaryAlignments) { supp_check |= BAM_FSECONDARY; } // helper variables BGZF* in = 0; BGZF* out = 0; bam1_t *b = bam_init1(); bam_hdr_t *h; // open bam if ((in = bgzf_open(inBamFile, "r")) == 0) { fprintf(stderr, "ERROR: Failed to open \"%s\" for reading.\n", inBamFile); } else if ((h = bam_hdr_read(in)) == 0) { // read header fprintf(stderr, "ERROR: Failed to read BAM header of file \"%s\".\n", inBamFile); } else if ((out = bgzf_open(outBamFile, "w")) == 0) { fprintf(stderr, "ERROR: Failed to open \"%s\" for writing.\n", outBamFile); } else { // write and destroy header bam_hdr_write(out, h); bam_hdr_destroy(h); int line = 0; int matches, mismatches, qLen; float pcAln, pcId; int showStats = 0; // fetch alignments while ((result = bam_read1(in, b)) >= 0) { line += 1; // only primary mappings if ((b->core.flag & supp_check) != 0) { if (showStats) fprintf(stdout, "Rejected %d, non-primary\n", line); continue; } // only high quality if (b->core.qual < minMapQual) { if (showStats) fprintf(stdout, "Rejected %d, quality: %d\n", line, b->core.qual); continue; } // not too many absolute mismatches mismatches = bam_aux2i(bam_aux_get(b, "NM")); if (mismatches > maxMisMatches) { if (showStats) fprintf(stdout, "Rejected %d, mismatches: %d\n", line, mismatches); continue; } // not too short qLen = bam_cigar2qlen((&b->core)->n_cigar, bam_get_cigar(b)); if (qLen < minLen) { if (showStats) fprintf(stdout, "Rejected %d, length: %d\n", line, qLen); continue; } // only high percent identity matches = bam_cigar2matches((&b->core)->n_cigar, bam_get_cigar(b)); pcId = (matches - mismatches) / (float)matches; // percentage as float between 0 to 1 if (pcId < 
minPcId) { if (showStats) fprintf(stdout, "Rejected %d, identity pc: %.4f\n", line, pcId); continue; } // only high percent alignment pcAln = matches / (float)qLen; // percentage as float between 0 to 1 if (pcAln < minPcAln) { if (showStats) fprintf(stdout, "Rejected %d, alignment pc: %.4f\n", line, pcAln); continue; } if ((outResult = bam_write1(out, b)) < -1) { fprintf(stderr, "ERROR: Attempt to write read no. %d to file \"%s\" failed with code %d.\n", line, outBamFile, outResult); } } if (result < -1) { fprintf(stderr, "ERROR: retrieval of read no. %d from file \"%s\" failed with code %d.\n", line, inBamFile, result); } } if (in) bgzf_close(in); if (out) bgzf_close(out); bam_destroy1(b); }
BM_mappedRead * extractReads(char * bamFile, char ** contigs, int numContigs, uint16_t * groups, char * prettyName, int headersOnly, int minMapQual, int maxMisMatches, int ignoreSuppAlignments, int ignoreSecondaryAlignments) { //----- // code uses the pattern outlined in samtools view (sam_view.c) // thanks lh3! // int i = 0; int result = -1; int hh = 0; int supp_check = 0x0; // include supp mappings if (ignoreSuppAlignments) { supp_check |= BAM_FSUPPLEMENTARY; } if (ignoreSecondaryAlignments) { supp_check |= BAM_FSECONDARY; } // we need to let the users know if their pairings // will be corrupted int p_corrupt = 0; // helper variables samFile *in = 0; bam_hdr_t *header = NULL; bam1_t *b = bam_init1(); BM_mappedRead * root = 0; BM_mappedRead * prev = 0; // open file handlers if ((in = sam_open(bamFile, "r")) == 0) { fprintf(stderr, "ERROR: Failed to open \"%s\" for reading.\n", bamFile); } else { // retrieve the header if ((header = sam_hdr_read(in)) == 0) { fprintf(stderr, "ERROR: Failed to read the header from \"%s\".\n", bamFile); } else { // check the index is intact hts_idx_t *idx = sam_index_load(in, bamFile); // load index if (idx == 0) { // index is unavailable fprintf(stderr, "ERROR: Random retrieval only works "\ "for indexed files.\n"); } else { cfuhash_table_t *pair_buffer = \ cfuhash_new_with_initial_size(1000000); cfuhash_set_flag(pair_buffer, CFUHASH_FROZEN_UNTIL_GROWS); for (hh = 0; hh < numContigs; ++hh) { // parse a region in the format like `chr2:100-200' hts_itr_t *iter = sam_itr_querys(idx, header, contigs[hh]); if (iter == NULL) { // reference name is not found fprintf(stderr, "WARNING: Could not find contig: "\ "[%s] in BAM: [%s].\n", contigs[hh], bamFile); } // fetch alignments int line = 0; while ((result = sam_itr_next(in, iter, b)) >= 0) { bam1_core_t core = b->core; line += 1; // only high quality?, primary? 
mappings if ( core.qual < minMapQual) continue; if ((core.flag & supp_check) != 0) continue; if(bam_aux2i(bam_aux_get(b, "NM")) > maxMisMatches) { continue; } char * seqId = bam_get_qname(b); char * seq = 0; char * qual = 0; int qual_len = 0; int seq_len = 0; // get sequence and quality if(0 == headersOnly) { // no point allocating unused space seq = calloc(core.l_qseq+1, sizeof(char)); qual = calloc(core.l_qseq+1, sizeof(char)); uint8_t *s = bam_get_seq(b); if (core.flag&BAM_FREVERSE) { // reverse the read int r = 0; for (i = core.l_qseq-1; i >=0 ; --i) { seq[r]="=TGKCYSBAWRDMHVN"[bam_seqi(s, i)]; ++r; } } else { for (i = 0; i < core.l_qseq; ++i) { seq[i]="=ACMGRSVTWYHKDBN"[bam_seqi(s, i)]; } } seq_len = core.l_qseq; s = bam_get_qual(b); if (s[0] != 0xff) { qual_len = core.l_qseq; for (i = 0; i < core.l_qseq; ++i) { qual[i] = (char)(s[i] + 33); } } else if (qual != 0) { free(qual); qual = 0; } } // work out pairing information uint8_t rpi = RPI_ERROR; if (core.flag&BAM_FPAIRED) { if(core.flag&BAM_FMUNMAP) { if (core.flag&BAM_FREAD1) { rpi = RPI_SNGL_FIR; } else if (core.flag&BAM_FREAD2) { rpi = RPI_SNGL_SEC; } } else { if (core.flag&BAM_FREAD1) { rpi = RPI_FIR; } else if (core.flag&BAM_FREAD2) { rpi = RPI_SEC; } } } else { rpi = RPI_SNGL; } // make the funky Id #define MAX_SEQ_ID_LEN 80 char * seq_id = calloc(MAX_SEQ_ID_LEN, sizeof(char)); // allocate the string to the buffer but check to // ensure we're not cutting anything off int id_len = snprintf(seq_id, MAX_SEQ_ID_LEN, "b_%s;c_%s;r_%s", prettyName, contigs[hh], seqId); if(id_len >= MAX_SEQ_ID_LEN) { seq_id = calloc(id_len+1, sizeof(char)); snprintf(seq_id, id_len+1, // don't forget the NULL! 
"b_%s;c_%s;r_%s", prettyName, contigs[hh], seqId); } // make the mapped read struct prev = makeMappedRead(seq_id, seq, qual, id_len, seq_len, qual_len, rpi, groups[hh], prev); if (0 == root) { root = prev; } if(rpi == RPI_SNGL || \ rpi == RPI_SNGL_FIR || \ rpi == RPI_SNGL_SEC) { // we can just add away // indicate singleton reads by pointing the // partner pointer to itself prev->partnerRead = prev; } else { // RPI_FIR or RPI_SEC // work out pairing information using the hash // we append a 1 or 2 to the end so that // we don't accidentally pair 1's with 1's etc. char * stripped_result; if(rpi == RPI_FIR) { stripped_result = \ pairStripper(seqId, core.l_qname-1, '2'); } else { stripped_result = \ pairStripper(seqId, core.l_qname-1, '1'); } char * stripped = seqId; if(stripped_result) stripped = stripped_result; //fprintf(stdout, "SEARCH %s\n", stripped); // now stripped always holds a stripped value // see if it is in the hash already BM_mappedRead * stored_MR = \ cfuhash_get(pair_buffer, stripped); if (0 != stored_MR) { // exists in the hash -> Add the pair info if(rpi == RPI_FIR) { prev->partnerRead = stored_MR; } else { stored_MR->partnerRead = prev; } // delete the entry from the hash cfuhash_delete(pair_buffer, stripped); } else { // we should put it in the hash // make sure to change it into something // we will find next time if(rpi == RPI_FIR) stripped[strlen(stripped)-1] = '1'; else stripped[strlen(stripped)-1] = '2'; // check to make sure we're not overwriting // anything important. cfuhash overwrites // duplicate entries, so we need to grab // it and put it to "SNGL_XXX" before we // lose the pointer BM_mappedRead * OWMMR = \ cfuhash_put(pair_buffer, stripped, prev); if(OWMMR) { if(OWMMR->rpi == RPI_FIR) OWMMR->rpi = RPI_SNGL_FIR; else OWMMR->rpi = RPI_SNGL_SEC; OWMMR->partnerRead = OWMMR; printPairCorruptionWarning(p_corrupt); p_corrupt = 1; } } if(stripped_result != 0) { // free this! 
free(stripped_result); stripped_result = 0; } } } hts_itr_destroy(iter); if (result < -1) { fprintf(stderr, "ERROR: retrieval of reads from "\ "contig: \"%s\" failed due to "\ "truncated file or corrupt BAM index "\ "file\n", header->target_name[hh]); break; } } // any entries left in the hash are pairs whose mates did // not meet quality standards size_t key_size = 0; char * key; BM_mappedRead * LOMMR; size_t pr_size = 1; if(cfuhash_each_data(pair_buffer, (void**)&key, &key_size, (void**)&LOMMR, &pr_size)) { do { // get the mapped read // update it's pairing so we know it's really single if (LOMMR->rpi == RPI_FIR) LOMMR->rpi = RPI_SNGL_FIR; else if (LOMMR->rpi == RPI_SEC) LOMMR->rpi = RPI_SNGL_SEC; // indicate singleton reads by pointing the // partner pointer to itself LOMMR->partnerRead = LOMMR; } while(cfuhash_next_data(pair_buffer, (void**)&key, &key_size, (void**)&LOMMR, &pr_size)); } cfuhash_clear(pair_buffer); cfuhash_destroy(pair_buffer); } hts_idx_destroy(idx); // destroy the BAM index } } // always do this if (in) sam_close(in); bam_destroy1(b); if ( header ) bam_hdr_destroy(header); return root; }
void bam_filter(array_list_t *bam1s, array_list_t *passed_bam1s, array_list_t *failed_bam1s, bam_filter_options_t *opts) { bam1_t *bam1; uint32_t bam_flag; int value; region_t region; char **chromosomes; int num_chromosomes; if (opts->region_table) { chromosomes = opts->region_table->ordering; num_chromosomes = opts->region_table->max_chromosomes; region.strand = NULL; region.type = NULL; } size_t num_items = array_list_size(bam1s); for (size_t i = 0; i < num_items; i++) { bam1 = array_list_get(i, bam1s); bam_flag = (uint32_t) bam1->core.flag; // if not mapped, if (bam_flag & BAM_FUNMAP) { array_list_insert(bam1, failed_bam1s); continue; } // unique if (opts->unique && (bam_flag & BAM_FSECONDARY)) { array_list_insert(bam1, failed_bam1s); continue; } // proper pairs if (opts->proper_pairs && (bam_flag & BAM_FPAIRED)) { if ( !(bam_flag & BAM_FPROPER_PAIR)) { array_list_insert(bam1, failed_bam1s); continue; } } // length value = bam1->core.l_qseq; if (value < opts->min_length || value > opts->max_length) { array_list_insert(bam1, failed_bam1s); continue; } // quality value = bam1->core.qual; if (value < opts->min_quality || value > opts->max_quality) { array_list_insert(bam1, failed_bam1s); continue; } // num. error value = bam_aux2i(bam_aux_get(bam1, "NM")); if (value < opts->min_num_errors || value > opts->max_num_errors) { array_list_insert(bam1, failed_bam1s); continue; } // region if (opts->region_table) { int seq_id = bam1->core.tid; if (seq_id >=0 && seq_id < num_chromosomes) { region.chromosome = chromosomes[bam1->core.tid]; region.start_position = bam1->core.pos; region.end_position = region.start_position + bam1->core.l_qseq; if (!find_exact_region(®ion, opts->region_table)) { array_list_insert(bam1, failed_bam1s); continue; } } else { array_list_insert(bam1, failed_bam1s); continue; } } // finally, this bam1 passed all the filters, // insert it in the output list array_list_insert(bam1, passed_bam1s); } }