// Entry point: stream reads from one or more coordinate-sorted BAM files and
// accumulate per-position tag coverage for each region listed in a sorted
// region file; optionally emit position-level pile-up stats and a per-locus
// bias table.
// NOTE(review): relies on helpers/globals defined elsewhere in this file
// (interface(), eatline(), gene_processing(), ParseCigar(), int2str(),
// struct parameters/region/lb, global read_length) — their contracts are
// assumed, not visible here.
int main ( int argc, char *argv[] ) {

  struct parameters *param = 0;
  param = interface(param, argc, argv);  // parse command-line options into param

  //region file input (the region file should be sorted as the same way as the bam file)
  ifstream region_f;
  region_f.open(param->region_f, ios_base::in);  // the region file is opened
  // NOTE(review): open() success is not checked; a missing region file makes
  // the first getline() below fail silently.

  //bam input and generate index if not yet
  //-------------------------------------------------------------------------------------------------------+
  // BAM input (file or filenames?)                                                                        |
  //-------------------------------------------------------------------------------------------------------+
  char *fof = param->mapping_f;
  FILE *IN=NULL;
  char linefof[5000];
  int filecount=0;
  vector <string> fnames;

  if (strchr(fof,' ')!=NULL) {
    // space-separated list of BAM files given directly on the command line
    char *ptr;
    ptr=strtok(fof," ");
    while (ptr!=NULL) {
      fnames.push_back(ptr);
      filecount++;
      ptr=strtok(NULL," ");
    }
  }
  else {
    // single argument: either a file-of-filenames or a single BAM file
    IN=fopen(fof,"rt");
    if (IN!=NULL) {
      long linecount=0;
      while (fgets(linefof,5000-1,IN)!=NULL) {
        linecount++;
        if (linefof[0]!='#' && linefof[0]!='\n') {  // skip comments and blank lines
          char *ptr=strchr(linefof,'\n');
          if (ptr!=NULL && ptr[0]=='\n') {
            ptr[0]='\0';  // strip trailing newline
          }
          // probe each listed name: if it opens, treat the argument as a
          // file of filenames; otherwise fall back to "single BAM file"
          FILE *dummy=NULL;
          dummy=fopen(linefof,"rt");
          if (dummy!=NULL) {  // seems to be a file of filenames...
            fclose(dummy);
            fnames.push_back(linefof);
            filecount++;
          }
          else if (filecount==0 || linecount>=1000-1) {  // seems to be a single file
            fnames.push_back(fof);
            filecount++;
            break;
          }
        }
      }
      fclose(IN);
    }
  }
  //file or file name decided and stored in vector "fnames"

  cerr << "the input mapping files are:" << endl;
  vector <string>::iterator fit = fnames.begin();
  for(; fit != fnames.end(); fit++) {
    cerr << *fit << endl;
  }
  //-------------------------------------------------------------------------------------------------------+
  // end of file or filenames                                                                              |
  //-------------------------------------------------------------------------------------------------------+

  // open the BAM file(s)
  BamMultiReader reader;
  reader.Open(fnames);

  // get header & reference information
  string header = reader.GetHeaderText();
  RefVector refs = reader.GetReferenceData();

  if ( ! reader.LocateIndexes() )     // opens any existing index files that match our BAM files
    reader.CreateIndexes();           // creates index files for BAM files that still lack one

  // locus bias: per-offset counters (0..999) accumulated by gene_processing()
  struct lb empty_profile = {0,0,0,0};
  vector <struct lb> locus_b(1000, empty_profile);

  // output locus bias file
  string locus_bias_set = param->lbias;
  ofstream locus_bias;
  if ( locus_bias_set != "" ) {
    locus_bias.open(param->lbias);
    if ( !locus_bias ) {
      cerr << "can not open locus_bias file.\n";
      exit(0);
    }
  }

  //should decide which chromosome
  string line;
  string old_chr = "SRP";       // sentinel that cannot match a real chromosome name
  string type = param->type;    // "p" selects paired-end (fragment-level) handling

  //whether do some position-level pile-up stuff
  bool posc = false;
  ofstream posc_f;
  ofstream chrmap_f;
  string poscset = param->posc;
  if ( poscset != "" ) {
    posc = true;
    posc_f.open(param->posc);
    chrmap_f.open(param->chrmap);
  }

  // whether region-file chromosome names lack the "chr" prefix — TODO confirm
  // against eatline()'s handling of the flag
  bool noChr;
  if ( param->nochr == 1 ){
    noChr = true;
  } else {
    noChr = false;
  }

  //regions for the input of region file
  deque <struct region> regions;

  getline(region_f, line);       //get the first line
  eatline(line,regions,noChr);   // parse the line into a region struct

  deque <struct region>::iterator it = regions.begin();

  // outer loop: one iteration per chromosome encountered in the region file
  while ( it->chr != old_chr ) {
    old_chr = it->chr;  // set the current chr as old chr

    int chr_id = reader.GetReferenceID(it->chr);

    if ( chr_id == -1 ) {  //reference not found
      // flush every queued region on this unknown chromosome, then keep
      // consuming region-file lines until a new chromosome appears
      for (; it != regions.end() && it->chr == old_chr; ) {
        gene_processing(*it,locus_b);  // print the old region info
        it = regions.erase(it);        // erase the current region
      }
      while ( regions.empty() ) {
        getline(region_f, line);
        if ( region_f.eof() ){
          cerr << "finished: end of region file, zone 0" << endl;
          break;
        }
        eatline(line, regions,noChr);
        it = regions.begin();
        if (it->chr == old_chr){
          gene_processing(*it,locus_b);
          regions.clear();
          continue;
        }
      }
      continue;
    }

    int chr_len = refs.at(chr_id).RefLength;

    if ( !reader.SetRegion(chr_id, 1, chr_id, chr_len) ) // here set region
      {
        cerr << "bamtools count ERROR: Jump region failed " << it->chr << endl;
        reader.Close();
        exit(1);
      }

    //pile-up pos stats (reset per chromosome)
    set <string> fragment;               // read names with one mate already seen (paired mode)
    map <string, unsigned int> pileup;   // alignment-summary -> multiplicity at current start
    bool isposPileup = false;
    unsigned int old_start   = 0;
    unsigned int total_tags  = 0;
    unsigned int total_pos   = 0;
    unsigned int pileup_pos  = 0;

    BamAlignment bam;
    while (reader.GetNextAlignment(bam)) {

      if ( bam.IsMapped() == false ) continue;   // skip unaligned reads

      unsigned int unique;
      bam.GetTag("NH", unique);
      // NOTE(review): if the NH tag is absent, `unique` stays uninitialized
      // yet is used below as a divisor — consider defaulting it to 1.
      if (param->unique == 1) {
        if (unique != 1) {                       // keep only uniquely mapped reads (NH==1)
          continue;
        }
      }

      if (read_length == 0){  // remember read length from the first read (global)
        read_length = bam.Length;
      }

      //cout << bam.Name << endl;

      string chrom = refs.at(bam.RefID).RefName;
      string strand = "+";
      if (bam.IsReverseStrand()) strand = "-";

      unsigned int alignmentStart = bam.Position+1;   // convert 0-based BAM to 1-based
      unsigned int mateStart;
      if (type == "p") mateStart = bam.MatePosition+1;
      unsigned int alignmentEnd = bam.GetEndPosition();
      unsigned int cigarEnd;   // presumably an out-parameter of ParseCigar — TODO confirm
      vector <int> blockLengths;
      vector <int> blockStarts;
      blockStarts.push_back(0);
      ParseCigar(bam.CigarData, blockStarts, blockLengths, cigarEnd);

      // position check for unique mapped reads (because is paired-end reads, shoule base on fragment level for paired end reads)
      if (posc == true && unique == 1) {
        if (type == "p" && fragment.count(bam.Name) > 0)
          // second mate of a fragment already counted: drop it
          fragment.erase(bam.Name);
        else {
          total_tags++;
          if (type == "p"){
            fragment.insert(bam.Name);
          }
          // summary key identifying this alignment's start/mate/end + strand
          string alignSum;
          if (type == "p") {
            alignSum = int2str(alignmentStart) + "\t" + int2str(mateStart) + "\t.\t" + strand;
          } else {
            alignSum = int2str(alignmentStart) + "\t" + int2str(alignmentEnd) + "\t.\t" + strand;
          }
          if ( alignmentStart != old_start ) {
            // new start coordinate: flush the pile-up collected at the old one
            isposPileup = false;
            map <string, unsigned int>::iterator pit = pileup.begin();
            for (; pit != pileup.end(); pit++) {
              posc_f << chrom << "\truping\tpileup\t" << pit->first << "\t.\t" << "Pileup=" << pit->second << endl;  //print pileup
            }
            pileup.clear();  //clear pileup set
            pileup.insert( pair <string, unsigned int> (alignSum, 1) );  //insert the new read
            total_pos++;
          }
          else if ( alignmentStart == old_start ) { // same starts
            if ( pileup.count(alignSum) > 0 ) {     // pileup
              if ( pileup[alignSum] == 1 && isposPileup == false ) {
                pileup_pos++;       // count this position as a pile-up position once
                isposPileup = true;
              }
              pileup[alignSum]++;
            }
            else {
              pileup.insert( pair <string, unsigned int> (alignSum, 1) );
            }
          } //same starts
        }   //new fragment
        old_start = alignmentStart;
      }  // do pos check

      // coverage increment: halved for junction (multi-block) reads,
      // divided by NH for multi-mapped reads
      float incre = 1.;
      if (blockStarts.size() > 1) incre = 0.5;   // incre half for junction reads
      incre /= static_cast < float >(unique);    // for multi aligned reads

      deque <struct region>::iterator iter = regions.begin();

      if ( iter->start > alignmentEnd ) continue;  // skip reads not overlapping with the first region

      // NOTE(review): iter->chr is dereferenced before the iter != regions.end()
      // test — UB if iter is end(); the end() check should come first.
      while ( iter->chr == old_chr && iter->start <= alignmentEnd && iter != regions.end() ) {

        if (iter->end < alignmentStart) {  // the region end is beyond the alignmentStart
          gene_processing(*iter,locus_b);  // processing
          iter = regions.erase(iter);      // this region should be removed
          if ( regions.empty() ) {
            getline(region_f, line);       // get a line of region file
            if ( ! region_f.eof() ) {
              eatline(line, regions, noChr);  // eat a line and put it into the duque
              iter = regions.begin();
            }
            else {  // it's reaching the end of the region file
              cerr << "finished: end of region file, zone 3" << endl;
              break;
            }
          }
          continue;
        }

        if (iter->end >= alignmentStart && iter->start <= alignmentEnd) {  //overlapping, should take action
          // add `incre` at each aligned block start, keyed by offset into the region
          vector <int>::iterator cigit = blockStarts.begin();
          for (; cigit != blockStarts.end(); cigit++) {
            unsigned int current_start = *cigit + alignmentStart;
            int current_pos = current_start - (iter->start);
            //cout << iter->chr << "\t" << iter->start << "\t" << iter->end << "\t" << current_start << endl;
            if ( (iter->tags).count(current_pos) > 0 ) {
              (iter->tags)[current_pos] += incre;
            }
            else
              (iter->tags).insert( pair<int, float>(current_pos, incre) );
          }
        } // overlapping take action!

        if ( (iter+1) != regions.end() )  iter++;  // if this region is not the last element in the deque
        else {                                     // the last element
          getline(region_f, line);                 // get a line of region file
          if ( ! region_f.eof() ){
            eatline(line, regions, noChr);         // eat a line and put it into the duque
            iter = regions.end();
            iter--;                                // point at the region just appended
          }
          else {  //it's reaching the end of the region file
            cerr << "finished: end of region file, zone 4" << endl;
            break;
          }
        }
      } //while
    }  // read a bam

    // print chr map
    if (posc == true) {
      chrmap_f << old_chr << "\t" << total_tags << "\t" << total_pos << "\t" << pileup_pos << endl;
    }

    //somehow to loop back
    it = regions.begin();  //reset to begin
    for (; it != regions.end() && it->chr == old_chr; ) {
      gene_processing(*it,locus_b);  // print the old region info
      it = regions.erase(it);        // erase the current region
    }

    // consume region-file lines until a region on a NEW chromosome is queued;
    // same-chromosome stragglers are processed and discarded immediately
    while ( regions.empty() ) {
      getline(region_f, line);
      if ( region_f.eof() ){
        cerr << "finished: end of region file, zone 5" << endl;
        //print locus bias
        for (unsigned int l = 0; l < 1000; l++){
          locus_bias << l << "\t" << locus_b[l].ps << "\t" << locus_b[l].hs << "\t" << locus_b[l].pe << "\t" << locus_b[l].he << endl;
        }
        exit(0);
      }
      eatline(line, regions, noChr);
      it = regions.begin();
      if (it->chr == old_chr){
        gene_processing(*it, locus_b);
        regions.clear();
        continue;
      }
    }
  } // region chr != old chr

  regions.clear();
  reader.Close();
  region_f.close();
  return 0;
} //main
int main (int argc, char *argv[]) { int minBaseQuality = 0; string usage=string(""+string(argv[0])+" [in BAM file] [in VCF file] [chr name] [deam out BAM] [not deam out BAM]"+ "\nThis program divides aligned single end reads into potentially deaminated\n"+ "\nreads and the puts the rest into another bam file if the deaminated positions are not called as the alternative base in the VCF.\n"+ "\nThis is like filterDeaminatedVCF but it loads the VCF before then labels the reads instead of doing it on the fly\n"+ "\nwhich is good if you have many reads in the bam file.\n"+ "\nTip: if you do not need one of them, use /dev/null as your output\n"+ "\narguments:\n"+ "\t"+"--bq [base qual] : Minimum base quality to flag a deaminated site (Default: "+stringify(minBaseQuality)+")\n"+ "\t"+"--1000g [vcf file] : VCF file from 1000g to get the putative A and T positions in modern humans (Default: "+vcf1000g+")\n"+ "\n"); if(argc == 1 || argc < 4 || (argc == 2 && (string(argv[0]) == "-h" || string(argv[0]) == "--help") ) ){ cerr << "Usage "<<usage<<endl; return 1; } for(int i=1;i<(argc-2);i++){ if(string(argv[i]) == "--bq"){ minBaseQuality=destringify<int>(argv[i+1]); i++; continue; } if(string(argv[i]) == "--1000g"){ vcf1000g=string(argv[i+1]); i++; continue; } } unsigned int maxSizeChromosome=250000000;//larger than chr1 hg19 bool * hasCnoT; bool * hasGnoA; bool * thousandGenomesHasA; bool * thousandGenomesHasT; cerr<<"Trying to allocating memory"<<endl; try{ hasCnoT = new bool[ maxSizeChromosome ]; hasGnoA = new bool[ maxSizeChromosome ]; thousandGenomesHasA = new bool[ maxSizeChromosome ]; thousandGenomesHasT = new bool[ maxSizeChromosome ]; }catch(bad_alloc& exc){ cerr<<"ERROR: allocating memory failed"<<endl; return 1; } cerr<<"Success in allocating memory"<<endl; for(unsigned int i = 0;i<maxSizeChromosome;i++){ hasCnoT[i]=false; hasGnoA[i]=false; thousandGenomesHasA[i]=false; thousandGenomesHasT[i]=false; } string bamfiletopen = string( argv[ argc-5 ] ); string vcffiletopen 
= string( argv[ argc-4 ] ); string chrname = string( argv[ argc-3 ] ); string deambam = string( argv[ argc-2 ] ); string nondeambam = string( argv[ argc-1 ] ); cerr<<"Reading consensus VCF "<<vcffiletopen<<" ... "<<endl; VCFreader vcfr (vcffiletopen, // vcffiletopen+".tbi", // chrname, // 1, // maxSizeChromosome, 0); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); if(toprint->getRef().length() != 1 ) continue; //if the VCF has a at least one G but no A if( toprint->hasAtLeastOneG() && !toprint->hasAtLeastOneA() ){ hasGnoA[ toprint->getPosition() ] =true; } if( toprint->hasAtLeastOneC() && !toprint->hasAtLeastOneT() ){ hasCnoT[ toprint->getPosition() ] =true; } } cerr<<"done reading VCF"<<endl; cerr<<"Reading 1000g VCF :"<<vcf1000g<<" ..."<<endl; string line1000g; ifstream myFile1000g; myFile1000g.open(vcf1000g.c_str(), ios::in); if (myFile1000g.is_open()){ while ( getline (myFile1000g,line1000g)){ vector<string> fields=allTokens(line1000g,'\t'); //0 chr //1 pos //2 id //3 ref //4 alt //check if same chr if(fields[0] != chrname){ cerr <<"Error, wrong chromosome in 1000g file for line= "<<line1000g<<endl; return 1; } //skip indels if(fields[3].size() != 1 || fields[4].size() != 1 ) continue; char ref=toupper(fields[3][0]); char alt=toupper(fields[4][0]); unsigned int pos=destringify<unsigned int>( fields[1] ); thousandGenomesHasA[ pos ] = ( (ref=='A') || (alt=='A') ); thousandGenomesHasT[ pos ] = ( (ref=='T') || (alt=='T') ); } myFile1000g.close(); }else{ cerr <<"Unable to open file "<<vcf1000g<<endl; return 1; } cerr<<"done reading 1000g VCF"<<endl; BamReader reader; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM file"<< bamfiletopen << endl; return 1; } //positioning the bam file int refid=reader.GetReferenceID(chrname); if(refid < 0){ cerr << "Cannot retrieve the reference ID for "<< chrname << endl; return 1; } //cout<<"redif "<<refid<<endl; //setting the BAM reader at that position reader.SetRegion(refid, 0, refid, -1); 
vector<RefData> testRefData=reader.GetReferenceData(); const SamHeader header = reader.GetHeader(); const RefVector references = reader.GetReferenceData(); BamWriter writerDeam; if ( !writerDeam.Open(deambam, header, references) ) { cerr << "Could not open output BAM file" << endl; return 1; } BamWriter writerNoDeam; if ( !writerNoDeam.Open(nondeambam, header, references) ) { cerr << "Could not open output BAM file" << endl; return 1; } unsigned int totalReads =0; unsigned int deaminatedReads =0; unsigned int ndeaminatedReads =0; unsigned int skipped =0; //iterating over the alignments for these regions BamAlignment al; int i; while ( reader.GetNextAlignment(al) ) { // cerr<<al.Name<<endl; //skip unmapped if(!al.IsMapped()){ skipped++; continue; } //skip paired end ! if(al.IsPaired() ){ continue; // cerr<<"Paired end not yet coded"<<endl; // return 1; } string reconstructedReference = reconstructRef(&al); char refeBase; char readBase; bool isDeaminated; if(al.Qualities.size() != reconstructedReference.size()){ cerr<<"Quality line is not the same size as the reconstructed reference"<<endl; return 1; } isDeaminated=false; if(al.IsReverseStrand()){ //first base next to 3' i = 0 ; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); if( readBase == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){ //isDeaminated=true; } if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } if( hasGnoA[al.Position+1] && !thousandGenomesHasA[al.Position+1] ) isDeaminated=true; // transformRef(&refeBase,&readBase); // vcfr.repositionIterator(chrname,al.Position+1,al.Position+1); // while(vcfr.hasData()){ // SimpleVCF * toprint=vcfr.getData(); // // cout<<*toprint<<endl; // //skip deletions in the alt // if(toprint->getRef().length() != 1 ) // continue; // if(toprint->getRef()[0] != refeBase){ // cerr<<reconstructedReference<<endl; // cerr<<al.Position<<endl; // cerr<<numberOfDeletions(&al)<<endl; // cerr<<"Problem1 position "<<*toprint<<" does 
not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; // exit(1); // } // //if the VCF has a at least one G but no A // if( toprint->hasAtLeastOneG() && // !toprint->hasAtLeastOneA() ){ // isDeaminated=true; // } // } } //second base next to 3' i = 1; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'G' && if( readBase == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){ //isDeaminated=true; } if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } if( hasGnoA[al.Position+2] && !thousandGenomesHasA[al.Position+2] ) isDeaminated=true; // transformRef(&refeBase,&readBase); // vcfr.repositionIterator(chrname,al.Position+2,al.Position+2); // while(vcfr.hasData()){ // SimpleVCF * toprint=vcfr.getData(); // // cout<<*toprint<<endl; // //skip deletions in the alt // if(toprint->getRef().length() != 1 ) // continue; // if(toprint->getRef()[0] != refeBase){ // cerr<<reconstructedReference<<endl; // cerr<<al.Position<<endl; // cerr<<numberOfDeletions(&al)<<endl; // cerr<<"Problem2 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; // exit(1); // } // //if the VCF has at least one G but no A // // if(toprint->hasAtLeastOneG() && // // toprint->getAlt().find("A") == string::npos){ // if( toprint->hasAtLeastOneG() && // !toprint->hasAtLeastOneA() ){ // isDeaminated=true; // } // } } //last base next to 5' i = (al.QueryBases.length()-1) ; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'G' && if( readBase == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){ //isDeaminated=true; } if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); int lengthMatches=countMatchesRecons(reconstructedReference,0); int positionJump = al.Position+lengthMatches+numberOfDeletions(&al); if(hasGnoA[positionJump] && !thousandGenomesHasA[positionJump] ) isDeaminated=true; // 
vcfr.repositionIterator(chrname,positionJump,positionJump); // while(vcfr.hasData()){ // SimpleVCF * toprint=vcfr.getData(); // //skip deletions in the alt // if(toprint->getRef().length() != 1 ) // continue; // if(toprint->getRef()[0] != refeBase){ // cerr<<reconstructedReference<<endl; // cerr<<al.Position<<endl; // cerr<<lengthMatches<<endl; // cerr<<numberOfDeletions(&al)<<endl; // cerr<<positionJump<<endl; // cerr<<"Problem3 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; // exit(1); // } // //if the VCF has at least one G but no A // if( toprint->hasAtLeastOneG() && // !toprint->hasAtLeastOneA() ){ // isDeaminated=true; // } // } } }else{ //first base next to 5' i = 0; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'C' if( readBase == 'T' && int(al.Qualities[i]-offset) >= minBaseQuality){ if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } // transformRef(&refeBase,&readBase); if(hasCnoT[al.Position+1] && !thousandGenomesHasT[al.Position+1] ) isDeaminated=true; // vcfr.repositionIterator(chrname,al.Position+1,al.Position+1); // while(vcfr.hasData()){ // SimpleVCF * toprint=vcfr.getData(); // //cout<<*toprint<<endl; // //skip deletions in the alt // if(toprint->getRef().length() != 1 ) // continue; // if(toprint->getRef()[0] != refeBase){ // cerr<<reconstructedReference<<endl; // cerr<<al.Position<<endl; // cerr<<numberOfDeletions(&al)<<endl; // cerr<<"Problem4 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; // exit(1); // } // //if the VCF has at least one C but no T // if( toprint->hasAtLeastOneC() && // !toprint->hasAtLeastOneT() ){ // isDeaminated=true; // } // } //cout<<al.Position+ } //second last base next to 3' i = (al.QueryBases.length()-2); refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'C' && if( readBase == 'T' && 
int(al.Qualities[i]-offset) >= minBaseQuality){ if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } //transformRef(&refeBase,&readBase); int lengthMatches=countMatchesRecons(reconstructedReference,1); int positionJump = al.Position+lengthMatches+numberOfDeletions(&al); if(hasCnoT[positionJump] && !thousandGenomesHasT[positionJump] ) isDeaminated=true; // vcfr.repositionIterator(chrname,positionJump,positionJump); // while(vcfr.hasData()){ // SimpleVCF * toprint=vcfr.getData(); // //skip deletions in the alt // if(toprint->getRef().length() != 1 ) // continue; // if(toprint->getRef()[0] != refeBase){ // cerr<<reconstructedReference<<endl; // cerr<<al.Position<<endl; // cerr<<lengthMatches<<endl; // cerr<<numberOfDeletions(&al)<<endl; // cerr<<positionJump<<endl; // cerr<<"Problem5 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; // exit(1); // } // if( toprint->hasAtLeastOneC() && // !toprint->hasAtLeastOneT() ){ // isDeaminated=true; // } // } } //last base next to 3' i = (al.QueryBases.length()-1); refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //&& refeBase == 'C' if( readBase == 'T' && int(al.Qualities[i]-offset) >= minBaseQuality){ if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); int lengthMatches=countMatchesRecons(reconstructedReference,0); int positionJump = al.Position+lengthMatches+numberOfDeletions(&al); if(hasCnoT[positionJump] && !thousandGenomesHasT[positionJump] ) isDeaminated=true; // vcfr.repositionIterator(chrname,positionJump,positionJump); // while(vcfr.hasData()){ // SimpleVCF * toprint=vcfr.getData(); // //skip deletions in the alt // if(toprint->getRef().length() != 1 ) // continue; // if(toprint->getRef()[0] != refeBase){ // cerr<<reconstructedReference<<endl; // cerr<<al.Position<<endl; // cerr<<lengthMatches<<endl; // cerr<<numberOfDeletions(&al)<<endl; // cerr<<positionJump<<endl; // 
cerr<<"Problem6 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; // exit(1); // } // if( toprint->hasAtLeastOneC() && // !toprint->hasAtLeastOneT() ){ // isDeaminated=true; // } // } } } totalReads++; if(isDeaminated){ deaminatedReads++; writerDeam.SaveAlignment(al); }else{ ndeaminatedReads++; writerNoDeam.SaveAlignment(al); } }//end for each read reader.Close(); writerDeam.Close(); writerNoDeam.Close(); delete(hasCnoT); delete(hasGnoA); cerr<<"Program finished sucessfully, out of "<<totalReads<<" mapped reads (skipped: "<<skipped<<" reads) we flagged "<<deaminatedReads<<" as deaminated and "<<ndeaminatedReads<<" as not deaminated"<<endl; return 0; }
void StructuralVariations::findTranslocationsOnTheFly(string bamFileName, bool outtie, float meanCoverage, string outputFileHeader, map<string,int> SV_options) { size_t start = time(NULL); //open the bam file BamReader bamFile; bamFile.Open(bamFileName); //Information from the header is needed to initialize the data structure SamHeader head = bamFile.GetHeader(); // now create Translocation on the fly Window *window; window = new Window(bamFileName,outtie,meanCoverage,outputFileHeader,SV_options); window->initTrans(head); //expands a vector so that it is large enough to hold reads from each contig in separate elements window->eventReads.resize(SV_options["contigsNumber"]); window->eventSplitReads.resize(SV_options["contigsNumber"]); window-> binnedCoverage.resize(SV_options["contigsNumber"]); window-> linksFromWin.resize(SV_options["contigsNumber"]); window -> numberOfEvents = 0; string line; string coverageFile=outputFileHeader+".tab"; ifstream inputFile( coverageFile.c_str() ); int line_number=0; while (std::getline( inputFile, line )){ if(line_number > 0){ vector<string> splitline; std::stringstream ss(line); std::string item; while (std::getline(ss, item, '\t')) { splitline.push_back(item); } window -> binnedCoverage[window -> contig2position[splitline[0]]].push_back(atof(splitline[3].c_str())); } line_number += 1; } inputFile.close(); //Initialize bam entity BamAlignment currentRead; //now start to iterate over the bam file int counter = 0; while ( bamFile.GetNextAlignmentCore(currentRead) ) { if(currentRead.IsMapped()) { window->insertRead(currentRead); } } for(int i=0;i< window-> eventReads.size();i++){ if(window -> eventReads[i].size() >= window -> minimumPairs){ window->computeVariations(i); } window->eventReads[i]=queue<BamAlignment>(); window->eventSplitReads[i] = vector<BamAlignment>(); } window->interChrVariationsVCF.close(); window->intraChrVariationsVCF.close(); printf ("variant calling time consumption= %lds\n", time(NULL) - start); }
// Compute per-Test-Fragment (TF) alignment statistics from a BAM file and
// write them to a JSON report: read-length histograms (full / aligned /
// AQ10 / AQ17), per-position error counts, system SNR and homopolymer (HP)
// accuracy, stratified by reference TF.
// Returns 0 on success, 1 on usage/IO error.
// NOTE(review): relies on helpers declared elsewhere (OptArgs,
// PopulateReferenceSequences, ReadLengthHistogram, SimpleHistogram,
// MetricGeneratorSNR, MetricGeneratorHPAccuracy, get_time_iso_string).
int IonstatsTestFragments(int argc, const char *argv[])
{
  OptArgs opts;
  opts.ParseCmdLine(argc, argv);
  string input_bam_filename   = opts.GetFirstString('i', "input", "");
  string fasta_filename       = opts.GetFirstString('r', "ref", "");
  string output_json_filename = opts.GetFirstString('o', "output", "ionstats_tf.json");
  int histogram_length        = opts.GetFirstInt   ('h', "histogram-length", 400);

  if(argc < 2 or input_bam_filename.empty() or fasta_filename.empty()) {
    IonstatsTestFragmentsHelp();
    return 1;
  }

  //
  // Prepare for metric calculation
  //

  // TF name -> reference sequence, loaded from the FASTA file
  map<string,string> tf_sequences;
  PopulateReferenceSequences(tf_sequences, fasta_filename);

  BamReader input_bam;
  if (!input_bam.Open(input_bam_filename)) {
    fprintf(stderr, "[ionstats] ERROR: cannot open %s\n", input_bam_filename.c_str());
    return 1;
  }

  int num_tfs = input_bam.GetReferenceCount();

  SamHeader sam_header = input_bam.GetHeader();
  if(!sam_header.HasReadGroups()) {
    fprintf(stderr, "[ionstats] ERROR: no read groups in %s\n", input_bam_filename.c_str());
    return 1;
  }

  // flow order and key sequence come from the read groups; if several read
  // groups define them, the last one seen wins
  string flow_order;
  string key;
  for (SamReadGroupIterator rg = sam_header.ReadGroups.Begin(); rg != sam_header.ReadGroups.End(); ++rg) {
    if(rg->HasFlowOrder())
      flow_order = rg->FlowOrder;
    if(rg->HasKeySequence())
      key = rg->KeySequence;
  }

  // Need these metrics stratified by TF.
  vector<ReadLengthHistogram>       called_histogram(num_tfs);
  vector<ReadLengthHistogram>       aligned_histogram(num_tfs);
  vector<ReadLengthHistogram>       AQ10_histogram(num_tfs);
  vector<ReadLengthHistogram>       AQ17_histogram(num_tfs);
  vector<SimpleHistogram>           error_by_position(num_tfs);
  vector<MetricGeneratorSNR>        system_snr(num_tfs);
  vector<MetricGeneratorHPAccuracy> hp_accuracy(num_tfs);

  for (int tf = 0; tf < num_tfs; ++tf) {
    called_histogram[tf].Initialize(histogram_length);
    aligned_histogram[tf].Initialize(histogram_length);
    AQ10_histogram[tf].Initialize(histogram_length);
    AQ17_histogram[tf].Initialize(histogram_length);
    error_by_position[tf].Initialize(histogram_length);
  }

  vector<uint16_t> flow_signal_fz(flow_order.length());
  vector<int16_t>  flow_signal_zm(flow_order.length());

  const RefVector& refs = input_bam.GetReferenceData();

  // Missing:
  //  - hp accuracy - tough, copy verbatim from TFMapper?

  BamAlignment alignment;
  vector<char>  MD_op;   // parsed MD operations: 'M' match, 'X' substitution, 'D' deletion
  vector<int>   MD_len;  // run length of each MD operation
  MD_op.reserve(1024);
  MD_len.reserve(1024);
  string MD_tag;

  //
  // Main loop over mapped reads in the input BAM
  //

  while(input_bam.GetNextAlignment(alignment)) {

    if (!alignment.IsMapped() or !alignment.GetTag("MD",MD_tag))
      continue;

    // The check below eliminates unexpected alignments
    if (alignment.IsReverseStrand() or alignment.Position > 5)
      continue;

    int current_tf = alignment.RefID;

    //
    // Step 1. Parse MD tag
    //

    MD_op.clear();
    MD_len.clear();

    for (const char *MD_ptr = MD_tag.c_str(); *MD_ptr;) {

      int item_length = 0;
      if (*MD_ptr >= '0' and *MD_ptr <= '9') {    // Its a match
        MD_op.push_back('M');
        for (; *MD_ptr and *MD_ptr >= '0' and *MD_ptr <= '9'; ++MD_ptr)
          item_length = 10*item_length + *MD_ptr - '0';
      } else {
        if (*MD_ptr == '^') {                     // Its a deletion
          MD_ptr++;
          MD_op.push_back('D');
        } else                                    // Its a substitution
          MD_op.push_back('X');
        for (; *MD_ptr and *MD_ptr >= 'A' and *MD_ptr <= 'Z'; ++MD_ptr)
          item_length++;
      }
      MD_len.push_back(item_length);
    }

    //
    // Step 2. Synchronously scan through Cigar and MD, doing error accounting
    //

    // forward/backward scan depending on strand (reverse never happens here
    // given the IsReverseStrand() skip above, but the code supports it)
    int MD_idx    = alignment.IsReverseStrand() ? MD_op.size()-1 : 0;
    int cigar_idx = alignment.IsReverseStrand() ? alignment.CigarData.size()-1 : 0;
    int increment = alignment.IsReverseStrand() ? -1 : 1;

    int AQ10_bases = 0;   // longest prefix with error rate <= 10%
    int AQ17_bases = 0;   // longest prefix with error rate <= 2%
    int num_bases = 0;
    int num_errors = 0;

    while (cigar_idx < (int)alignment.CigarData.size() and MD_idx < (int) MD_op.size() and cigar_idx >= 0 and MD_idx >= 0) {

      // NOTE: CigarData[].Length and MD_len[] are consumed destructively
      // (alignment is a local copy, so this is safe)
      if (alignment.CigarData[cigar_idx].Length == 0) { // Try advancing cigar
        cigar_idx += increment;
        continue;
      }
      if (MD_len[MD_idx] == 0) { // Try advancing MD
        MD_idx += increment;
        continue;
      }

      // Match
      if (alignment.CigarData[cigar_idx].Type == 'M' and MD_op[MD_idx] == 'M') {
        int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]);
        num_bases += advance;
        alignment.CigarData[cigar_idx].Length -= advance;
        MD_len[MD_idx] -= advance;

      // Insertion (read has a base, reference doesn't)
      } else if (alignment.CigarData[cigar_idx].Type == 'I') {
        int advance = alignment.CigarData[cigar_idx].Length;
        for (int cnt = 0; cnt < advance; ++cnt) {
          error_by_position[current_tf].Add(num_bases);
          num_bases++;
          num_errors++;
        }
        alignment.CigarData[cigar_idx].Length -= advance;

      // Deletion (reference has a base, read doesn't)
      } else if (alignment.CigarData[cigar_idx].Type == 'D' and MD_op[MD_idx] == 'D') {
        int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]);
        for (int cnt = 0; cnt < advance; ++cnt) {
          error_by_position[current_tf].Add(num_bases);
          num_errors++;          // deleted bases count as errors but not as read bases
        }
        alignment.CigarData[cigar_idx].Length -= advance;
        MD_len[MD_idx] -= advance;

      // Substitution
      } else if (MD_op[MD_idx] == 'X') {
        int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]);
        for (int cnt = 0; cnt < advance; ++cnt) {
          error_by_position[current_tf].Add(num_bases);
          num_bases++;
          num_errors++;
        }
        alignment.CigarData[cigar_idx].Length -= advance;
        MD_len[MD_idx] -= advance;

      } else {
        // unhandled CIGAR/MD combination (e.g. soft clip): stop accounting for this read
        printf("ionstats tf: Unexpected OP combination: %s Cigar=%c, MD=%c !\n",
            alignment.Name.c_str(), alignment.CigarData[cigar_idx].Type, MD_op[MD_idx]);
        break;
      }

      // running AQ lengths: record the read prefix while the error rate stays
      // under 1/10 (AQ10) resp. 1/50 (AQ17)
      if (num_errors*10 <= num_bases)   AQ10_bases = num_bases;
      if (num_errors*50 <= num_bases)   AQ17_bases = num_bases;
    }

    //
    // Step 3. Profit
    //

    called_histogram[current_tf].Add(alignment.Length);
    aligned_histogram[current_tf].Add(num_bases);
    AQ10_histogram[current_tf].Add(AQ10_bases);
    AQ17_histogram[current_tf].Add(AQ17_bases);

    // flow signal may be stored as ZM (int16) or legacy FZ (uint16)
    if(alignment.GetTag("ZM", flow_signal_zm))
      system_snr[current_tf].Add(flow_signal_zm, key.c_str(), flow_order);
    else if(alignment.GetTag("FZ", flow_signal_fz))
      system_snr[current_tf].Add(flow_signal_fz, key.c_str(), flow_order);

    // HP accuracy - keeping it simple
    if (!alignment.IsReverseStrand()) {
      // walk genome and called sequence flow by flow, comparing homopolymer lengths
      string genome = key + tf_sequences[refs[current_tf].RefName];
      string calls = key + alignment.QueryBases;
      const char *genome_ptr = genome.c_str();
      const char *calls_ptr = calls.c_str();
      for (int flow = 0; flow < (int)flow_order.length() and *genome_ptr and *calls_ptr; ++flow) {
        int genome_hp = 0;
        int calls_hp = 0;
        while (*genome_ptr == flow_order[flow]) {
          genome_hp++;
          genome_ptr++;
        }
        while (*calls_ptr == flow_order[flow]) {
          calls_hp++;
          calls_ptr++;
        }
        hp_accuracy[current_tf].Add(genome_hp, calls_hp);
      }
    }
  }

  //
  // Processing complete, generate ionstats_tf.json
  //

  Json::Value output_json(Json::objectValue);
  output_json["meta"]["creation_date"] = get_time_iso_string(time(NULL));
  output_json["meta"]["format_name"] = "ionstats_tf";
  output_json["meta"]["format_version"] = "1.0";

  output_json["results_by_tf"] = Json::objectValue;

  for (int tf = 0; tf < num_tfs; ++tf) {
    // TFs with too few aligned reads are omitted from the report
    if (aligned_histogram[tf].num_reads() < 1000)
      continue;
    called_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["full"]);
    aligned_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["aligned"]);
    AQ10_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["AQ10"]);
    AQ17_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["AQ17"]);
    error_by_position[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["error_by_position"]);
    system_snr[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]);
    hp_accuracy[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]);
    output_json["results_by_tf"][refs[tf].RefName]["sequence"] = tf_sequences[refs[tf].RefName];
  }

  input_bam.Close();

  ofstream out(output_json_filename.c_str(), ios::out);
  if (out.good()) {
    out << output_json.toStyledString();
    return 0;
  } else {
    fprintf(stderr, "ERROR: unable to write to '%s'\n", output_json_filename.c_str());
    return 1;
  }
}
// Generates multiple sorted temp BAM files from a single unsorted BAM file.
// Streams the input in buffered chunks of at most MaxBufferCount alignments;
// each full buffer is handed to CreateSortedTempFile(), which is expected to
// sort/flush it and leave the buffer empty for reuse (TODO confirm — that
// helper is defined outside this view). Unmapped reads arriving while the
// buffer is "full" are appended anyway so they are not split off into a
// separate temp file. Returns false only if the input BAM cannot be opened.
bool SortTool::SortToolPrivate::GenerateSortedRuns(void) {

    // open input BAM file
    BamReader reader;
    if ( !reader.Open(m_settings->InputBamFilename) ) {
        cerr << "bamtools sort ERROR: could not open " << m_settings->InputBamFilename
             << " for reading... Aborting." << endl;
        return false;
    }

    // get basic data that will be shared by all temp/output files
    SamHeader header = reader.GetHeader();
    // stamp the header with the sort order this run will produce
    header.SortOrder = ( m_settings->IsSortingByName
                       ? Constants::SAM_HD_SORTORDER_QUERYNAME
                       : Constants::SAM_HD_SORTORDER_COORDINATE );
    m_headerText = header.ToString();
    m_references = reader.GetReferenceData();

    // set up alignments buffer
    BamAlignment al;
    vector<BamAlignment> buffer;
    // 10% headroom: trailing unmapped reads may be pushed past "full" (see below)
    buffer.reserve( (size_t)(m_settings->MaxBufferCount*1.1) );
    bool bufferFull = false;

    // if sorting by name, we need to generate full char data
    // so can't use GetNextAlignmentCore()
    if ( m_settings->IsSortingByName ) {

        // iterate through file
        while ( reader.GetNextAlignment(al)) {

            // check buffer's usage
            bufferFull = ( buffer.size() >= m_settings->MaxBufferCount );

            // store alignments until buffer is "full"
            if ( !bufferFull )
                buffer.push_back(al);

            // if buffer is "full"
            else {
                // push any unmapped reads into buffer,
                // don't want to split these into a separate temp file
                if ( !al.IsMapped() )
                    buffer.push_back(al);

                // "al" is mapped, so create a sorted temp file with current buffer contents
                // then push "al" into fresh buffer
                else {
                    CreateSortedTempFile(buffer);
                    buffer.push_back(al);
                }
            }
        }
    }

    // sorting by position, can take advantage of GNACore() speedup
    else {
        // iterate through file
        while ( reader.GetNextAlignmentCore(al) ) {

            // check buffer's usage
            bufferFull = ( buffer.size() >= m_settings->MaxBufferCount );

            // store alignments until buffer is "full"
            if ( !bufferFull )
                buffer.push_back(al);

            // if buffer is "full"
            else {
                // push any unmapped reads into buffer,
                // don't want to split these into a separate temp file
                if ( !al.IsMapped() )
                    buffer.push_back(al);

                // "al" is mapped, so create a sorted temp file with current buffer contents
                // then push "al" into fresh buffer
                else {
                    CreateSortedTempFile(buffer);
                    buffer.push_back(al);
                }
            }
        }
    }

    // handle any leftover buffer contents
    if ( !buffer.empty() )
        CreateSortedTempFile(buffer);

    // close reader & return success
    reader.Close();
    return true;
}
// Local-realignment driver: walks the BAM reference-by-reference in fixed-size
// windows, collects reads per window, runs an indel detector, and — when the
// detector threshold is met — realigns the window's reads against candidate
// haplotypes, streaming (position-sorted) alignments to stdout via BamWriter.
// Diagnostics go to cerr, which is redirected into "bonsaiReport.txt".
int main (int argc, char** argv) {
    // Print Commandline
    string ss(argv[0]); // convert Char to String
    string commandline = "##Print Command line " + ss;
    int c;
    FastaReference* reference = NULL; // NOTE(review): never deleted; reclaimed only at process exit
    int minbaseQ = 10; //default
    int windowlen = 40; //by default
    string regionstr;
    string RegionFile;
    string bamfile;
    bool STdin = false;
    bool has_region = false;
    bool has_regionFile = false;
    bool has_bamfile = false;
    bool has_ref = false;
    int ploidy = 2;
    bool SetLowComplexityRegionSWGapExt = false;
    bool SetLowComplexityRegion = false;

    if (argc < 2) {
        printSummary(argv);
        exit(1);
    }

    // ---- command-line parsing (getopt_long) ----
    while (true) {
        static struct option long_options[] = {
            {"help", no_argument, 0, 'h'},
            {"ploidy", required_argument, 0, 'p'},
            {"window-size", required_argument, 0, 'w'},
            {"reference", required_argument, 0, 'f'},
            {"min-base-quality", required_argument, 0,'q'},
            {"Region", required_argument, 0, 'R'},
            {"STdin", no_argument, 0, 's'},
            {"bam", required_argument, 0, 'b'},
            {"Repeat-Extgap", no_argument, 0, 'E'},
            {"LowCompex", no_argument, 0, 'l'},
            {0, 0, 0, 0}
        };
        int option_index = 0;
        c = getopt_long (argc, argv, "hslEf:q:w:s:r:R:p:b:", long_options, &option_index);
        /* Detect the end of the options. */
        if (c == -1) break;
        switch (c) {
        case 'f':
            reference = new FastaReference(optarg); // will exit on open failure
            commandline = commandline + " -f " + optarg;
            has_ref = true;
            break;
        case 'b':
            has_bamfile = true;
            bamfile = optarg;
            commandline = commandline + " -b " + optarg;
            break;
        case 'r':
            regionstr = optarg;
            has_region = true;
            commandline = commandline + " -r " + optarg;
            break;
        case 'R':
            RegionFile = optarg;
            has_regionFile = true;
            commandline = commandline + " -R " + optarg;
            break;
        case 's':
            STdin = true;
            commandline = commandline + " -s ";
            break;
        case 'q':
            minbaseQ = atoi(optarg);
            commandline = commandline + " -q " + optarg;
            break;
        case 'w':
            windowlen = atoi(optarg);
            commandline = commandline + " -w " + optarg;
            break;
        case 'p':
            ploidy = atoi(optarg);
            commandline = commandline + " -p " + optarg;
            break;
        case 'E':
            SetLowComplexityRegionSWGapExt = true;
            commandline = commandline + " -E ";
            break;
        case 'l':
            SetLowComplexityRegion = true;
            commandline = commandline + " -l ";
            break;
        case 'h':
            printSummary(argv);
            commandline = commandline + " -h ";
            exit(0);
            break;
        case '?':
            printSummary(argv);
            exit(1);
            break;
        default:
            abort();
            break;
        }
    }

    //// Open Error log files
    ofstream cerrlog("bonsaiReport.txt");
    streambuf *cerrsave = std::cerr.rdbuf();
    // Redirect stream buffers: all cerr output below lands in bonsaiReport.txt
    if (cerrlog.is_open()) cerr.rdbuf(cerrlog.rdbuf());
    cerr << commandline << endl;

    //Check for Reference Fasta sequence
    if (!has_ref) {
        cerr << "no FASTA reference provided, cannot realign" << endl;
        exit(1);
    }

    ////Check for reader
    // NOTE(review): on Open() failure printSummary(argv) is called but execution
    // continues with an unopened reader — presumably printSummary exits; verify.
    BamReader reader;
    if (STdin == true) {
        if (!reader.Open("stdin")) {
            cerr << "could not open stdin bam for reading" << endl;
            cerr << reader.GetErrorString() << endl;
            reader.Close();
            printSummary(argv);
        }
    } else {
        if (has_bamfile == true) {
            if (!reader.Open(bamfile)) {
                cerr << "ERROR: could not open bam files from stdin ... Aborting" << endl;
                cerr << reader.GetErrorString() << endl;
                reader.Close();
                printSummary(argv);
            }
            if ( !reader.LocateIndex() ) reader.CreateIndex();
        } else {
            cerr << "--bam flag is set but no bamfile is provided... Aborting" << endl;
            reader.Close();
            printSummary(argv);
        }
    }

    //// Check Region Tags (the -r and -R options are mutually exclusive)
    if ( (has_regionFile == true) && (has_region == true) ) {
        cerr << "ERROR: You provide both region and has provide a Set Region List... Aborting" << endl;
        exit(1);
    }

    //// store the names of all the reference sequences in the BAM file
    vector<RefData> referencedata = reader.GetReferenceData();

    //// Store Region LIST
    vector<BamRegion> regionlist;
    if (has_region == true) {
        // single region given on the command line
        BamRegion region;
        ParseRegionString(regionstr, reader, region);
        regionlist.push_back(region);
    } else if (has_regionFile == true) {
        // one region per line in the region file
        ifstream RG(RegionFile.c_str(), ios_base::in);
        string line;
        while(getline(RG,line)) {
            BamRegion region;
            ParseRegionString(line, reader, region);
            regionlist.push_back(region);
        }
        RG.close();
    } else if ( (has_regionFile == false) && (has_region == false) ) {
        // no region given: build one region per reference sequence
        for (int i= 0; i < (int)referencedata.size(); i++) {
            string regionstr = referencedata.at(i).RefName;
            BamRegion region;
            ParseRegionString(regionstr, reader, region);
            if (!reader.SetRegion(region)) // Bam region will get [0,101) = 0 to 100 => [closed, half-opened)
            {
                cerr << "ERROR: set region " << regionstr << " failed. Check that REGION describes a valid range... Aborting" << endl;
                reader.Close();
                exit(1);
            } else
                regionlist.push_back(region);
        }
    }

    //// Output writer (realigned BAM goes to stdout)
    BamWriter writer;
    if (!writer.Open("stdout", reader.GetHeaderText(), reader.GetReferenceData())) {
        cerr << "could not open stdout for writing" << endl;
        exit(1);
    }

    //// Smallest start position and Largest end position for Req Seq
    vector<RefData>::iterator refdataIter = referencedata.begin();
    vector<BamRegion>::iterator regionListIter = regionlist.begin();
    // CLASS
    RealignFunctionsClass RealignFunction;
    map<int, string> RefIDRedName;                            // RefID -> reference name lookup
    vector<SalRealignInfo> AlGroups;                          // reads currently overlapping the active window
    multimap<int, BamAlignment> SortRealignedAlignmentsMultimap; // position-sorted output staging area
    int refid = 0;
    BamAlignment alignment;
    bool IsNextAlignment = reader.GetNextAlignment(alignment);
    //cerr << " " << alignment.Name << " Chr " << alignment.RefID << " Startpos: " << alignment.Position << " Endpos: " << alignment.GetEndPosition() << " Length: " << alignment.Length << endl;
    int windowrealigned = 0;
    int TotalWindowDetected = 0;
    int TotalReadsAligned = 0;
    int TotalWindow = 0;
    int TotalReads = 0;

    // ---- per-reference-sequence loop ----
    while (refdataIter != referencedata.end() ) {
        string refname = refdataIter->RefName;
        RefIDRedName[refid] = refname;
        int reflength = refdataIter->RefLength;
        int winstartpos, winendpos;
        int AllowableBasesInWindow = 1;
        bool nextChrName = false;
        cerr << "##HeaderINFO: RefID = " << refdataIter->RefName << "\t" << "RefLen = " << reflength << endl;

        // ---- per-region loop for the current reference sequence ----
        while (nextChrName == false ) {
            vector<int> minmaxRefSeqPos; // [0]=min, [1]=max reference positions touched; -1/0 = unset
            bool IsPassDetectorNoRealignment = false; // NOTE(review): written below but never read — appears vestigial
            minmaxRefSeqPos.push_back(-1);
            minmaxRefSeqPos.push_back(0);
            //cerr << " region: " << (*regionListIter).LeftRefID << " : " << (*regionListIter).LeftPosition << " .. " << (*regionListIter).RightPosition << endl;

            // NOTE(review): the refid == size()-1 clause limits this whole window
            // pipeline to the last reference sequence (and only when -r/-R was
            // given) — looks suspicious; confirm against intended behavior.
            if ((refid == (int)referencedata.size() - 1) && ((*regionListIter).LeftRefID == refid) && ((has_region==true) || (has_regionFile==true)) ) { ////
                if ( (has_region == true) || (has_regionFile == true) ) {
                    // window starts at the region's left edge; clamp reflength to region end
                    winstartpos = (*regionListIter).LeftPosition;
                    winendpos = winstartpos + windowlen - 1;
                    reflength = (*regionListIter).RightPosition;
                    if (reflength < winendpos) reflength = winendpos;
                    // Get Next Alignment First
                    if ( (refid == alignment.RefID) && (winstartpos == (*regionListIter).LeftPosition) && (IsNextAlignment == false) )
                        IsNextAlignment = reader.GetNextAlignment(alignment);
                } else if (has_region == false) {
                    // region-file mode without -r: start from position 0
                    winstartpos = 0;
                    winendpos = winstartpos + windowlen - 1;
                    // Get Next Alignment First
                    if ( (refid == alignment.RefID) && (winstartpos == 0) && (IsNextAlignment == false) )
                        IsNextAlignment = reader.GetNextAlignment(alignment);
                }
                //cerr << " winstart: " << winstartpos << " ; winend: " << winendpos;
                //cerr << " " << alignment.Name << " Chr " << alignment.RefID << " Startpos: " << alignment.Position << " Endpos: " << alignment.GetEndPosition() << " Length: " << alignment.Length << endl;

                //// ---- slide the window across the region ----
                while ((winstartpos < reflength)) {
                    //// Check window end position
                    if (winendpos > reflength) winendpos = reflength;
                    // Reinitialized
                    unsigned int NewReadMappedcount = 0;

                    //// Save and Erase alignments that are outside of window (Deque?)
                    if (!AlGroups.empty()) {
                        minmaxRefSeqPos.at(0) = -1;
                        minmaxRefSeqPos.at(1) = 0;
                        //cerr << "#Start: Keep alignments with start position exceed the right end of the window/Region " << endl;
                        vector<SalRealignInfo>::iterator Iter = AlGroups.begin();
                        while (Iter != AlGroups.end()) {
                            // Erase alignment s
                            if ((*Iter).al.GetEndPosition() < winstartpos) {
                                // read ends before the window: stage it for output and drop it
                                //cerr << " ToWrite: " << (*Iter).second.size() << " ; " << (*Iter).al.Name << " ; " << (*Iter).al.Position << " < " << winstartpos << " : " << (*Iter).al.GetEndPosition() << endl;
                                SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > ((*Iter).al.Position, (*Iter).al));
                                AlGroups.erase(Iter);
                                // NOTE(review): vector::erase invalidates Iter; the loop then reuses it,
                                // relying on it denoting the next element — formally UB. Verify/replace
                                // with Iter = AlGroups.erase(Iter) when touching this code.
                                //cerr << " ToWrite: DONE " << endl;
                            } else {
                                // read still overlaps: re-extract its variants against the reference
                                string referenceSequence = reference->getSubSequence(RefIDRedName[(*Iter).al.RefID], (*Iter).al.Position, 2*(*Iter).al.Length);
                                if ((*Iter).HasRealign == true ) {
                                    // reset the per-read parse cursors after a previous realignment
                                    (*Iter).currentReadPosition = 0;
                                    (*Iter).currentGenomeSeqPosition = 0;
                                    (*Iter).currentAlPosition = (*Iter).al.Position;
                                    (*Iter).cigarindex = 0;
                                }
                                (*Iter).CigarSoftclippingLength = 0;
                                SalRealignInfo talr = (*Iter);
                                //cerr << " ToKEEP: " << (*Iter).al.Name << " ; " << (*Iter).al.Position << " < " << winstartpos << " : " << (*Iter).al.GetEndPosition() << endl;
                                RealignFunction.ParseAlignmentsAndExtractVariantsByBaseQualv(AlGroups, talr, Iter, (*Iter).al, referenceSequence, minmaxRefSeqPos, winstartpos, winendpos, (float) minbaseQ, false);
                                ++Iter; //Increment iterator
                            }
                        }
                    }

                    // Write Sorted Alignments that are outside of window
                    //cerr << "SortRealignedAlignmentsMultimap: " << SortRealignedAlignmentsMultimap.size() << " minmaxRefSeqPos.at(0)= " << minmaxRefSeqPos.at(0) << endl;
                    if (!SortRealignedAlignmentsMultimap.empty()) // && (winWrite < winstartpos ) )
                    {
                        //cerr << "#Start: Write alignments and delete alignments with start position exceed the right end of the window/Region " << endl;
                        multimap<int, BamAlignment>::iterator sraIter = SortRealignedAlignmentsMultimap.begin();
                        while (sraIter != SortRealignedAlignmentsMultimap.end()) {
                            //cerr << " (*sraIter).first= " << (*sraIter).first << " minmaxRefSeqPos.at(0)= " << minmaxRefSeqPos.at(0) << " winstartpos - ((windowlen - 1)*0.9)= " << winstartpos - ((windowlen - 1)*0.9) << endl;
                            // flush only alignments safely behind the active window and
                            // behind the leftmost position still being realigned
                            if (((float) (*sraIter).first < floor((float) (winstartpos - ((windowlen - 1)*0.9)))) && ((minmaxRefSeqPos.at(0) > 0) && ((*sraIter).first < minmaxRefSeqPos.at(0)))) {
                                //writer.SaveAlignment((*sraIter).second); // Why sometimes, it doesn't work ?????
                                if (!writer.SaveAlignment((*sraIter).second)) cerr << writer.GetErrorString() << endl;
                                SortRealignedAlignmentsMultimap.erase(sraIter++);
                            } else {
                                ++sraIter;
                            }
                        }
                        //cerr << "#Done: Write alignments and delete alignments with start position exceed the right end of the window/Region " << endl;
                    }

                    //cerr << " winstart: " << winstartpos << " ; winend: " << winendpos;
                    //cerr << " " << alignment.Name << " Chr " << alignment.RefID << " Startpos: " << alignment.Position << " Endpos: " << alignment.GetEndPosition() << " Length: " << alignment.Length << endl;
                    //cerr << ": " << alignment.RefID << " :" << RefIDRedName[alignment.RefID] << " : " << RefIDRedName[alignment.RefID] << endl;
                    //cerr << "Start: Gather aligmenets that lie (fully or partially) within the window frame and group INDELs if there are ... " << endl;

                    // Gather Reads within a window frame
                    while ((IsNextAlignment) && (refid == alignment.RefID)) // Neeed more conditions
                    {
                        if (SetLowComplexityRegion == true) {
                            // -l mode: only parse/realign reads when the window sequence looks repetitive
                            string sequenceInWindow = reference->getSubSequence(RefIDRedName[alignment.RefID], winstartpos, (winendpos-winstartpos+1) );
                            if (IsWindowInRepeatRegion(sequenceInWindow) == true) {
                                if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) == 0) {
                                    TotalReads++;
                                    if (alignment.IsMapped()) {
                                        string referenceSequence = reference->getSubSequence(RefIDRedName[alignment.RefID], alignment.Position, 2*alignment.Length);
                                        vector<SalRealignInfo>::iterator tIter;
                                        SalRealignInfo alr;
                                        alr.al = alignment;
                                        alr.currentReadPosition = 0;
                                        alr.currentGenomeSeqPosition = 0;
                                        alr.currentAlPosition = alignment.Position;
                                        alr.cigarindex = 0;
                                        alr.HasRealign = false;
                                        alr.CigarSoftclippingLength = 0;
                                        // debug hook: trace reads whose name contains this marker
                                        string str = "ZZZZZZZZZZZZZZZZZ";
                                        if (alignment.Name.find(str) != string::npos) {
                                            stringstream cigar;
                                            for (vector<CigarOp>::const_iterator cigarIter = alignment.CigarData.begin(); cigarIter != alignment.CigarData.end(); ++cigarIter)
                                                cigar << cigarIter->Length << cigarIter->Type;
                                            string cigarstr = cigar.str();
                                            cerr << " TRACKING: " << alignment.RefID << " " << alignment.Name << " pos: " << alignment.Position << " cigar: " << cigarstr << endl;
                                        }
                                        RealignFunction.ParseAlignmentsAndExtractVariantsByBaseQualv(AlGroups, alr, tIter, alignment, referenceSequence, minmaxRefSeqPos, winstartpos, winendpos, (float) minbaseQ, true);
                                        NewReadMappedcount++;
                                    } else {
                                        SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment));
                                        cerr << "UNmapped : " << alignment.Name << endl;
                                    }
                                } else if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) == 1) {
                                    // partially in window: pass through unmodified
                                    SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment));
                                } else break; // read is past the window
                            } else {
                                // non-repeat window: pass reads through unmodified
                                if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) < 2)
                                    SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment));
                                else break;
                            }
                        } else // (SetLowComplexityRegion == false)
                        {
                            if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) == 0) {
                                TotalReads++;
                                if (alignment.IsMapped()) {
                                    string referenceSequence = reference->getSubSequence(RefIDRedName[alignment.RefID], alignment.Position, 2 * alignment.Length);
                                    vector<SalRealignInfo>::iterator tIter;
                                    SalRealignInfo alr;
                                    alr.al = alignment;
                                    alr.currentReadPosition = 0;
                                    alr.currentGenomeSeqPosition = 0;
                                    alr.currentAlPosition = alignment.Position;
                                    alr.cigarindex = 0;
                                    alr.HasRealign = false;
                                    alr.CigarSoftclippingLength = 0;
                                    // debug hook: trace reads whose name contains this marker
                                    string str = "ZZZZZZZZZZZZZZZZZ";
                                    if (alignment.Name.find(str) != string::npos) {
                                        stringstream cigar;
                                        for (vector<CigarOp>::const_iterator cigarIter = alignment.CigarData.begin(); cigarIter != alignment.CigarData.end(); ++cigarIter)
                                            cigar << cigarIter->Length << cigarIter->Type;
                                        string cigarstr = cigar.str();
                                        cerr << " TRACKING: " << alignment.RefID << " " << alignment.Name << " pos: " << alignment.Position << " cigar: " << cigarstr << endl;
                                    }
                                    RealignFunction.ParseAlignmentsAndExtractVariantsByBaseQualv(AlGroups, alr, tIter, alignment, referenceSequence, minmaxRefSeqPos, winstartpos, winendpos, (float) minbaseQ, true);
                                    //cerr << " winstart: " << winstartpos << " ; winend: " << winendpos;
                                    //cerr << " INDEL: " << alignment.RefID << " " << alignment.Name << " pos: " << alignment.Position << " Length: " << alignment.Length << " CIGARstr: " << cigarstr << endl;
                                    NewReadMappedcount++;
                                } else {
                                    SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment));
                                    cerr << "UNmapped : " << alignment.Name << endl;
                                }
                            } else if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) == 1) {
                                SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment));
                            } else break;
                        }
                        ////Get next alignment
                        IsNextAlignment = reader.GetNextAlignment(alignment);
                    }
                    //cerr << "Done: Gather aligmenets that lie (fully or partially) within the window frame and group INDELs if there are ... " << endl;

                    //// Detector Corner: decide whether this window needs realignment
                    bool ToRealign = MeetIndelDetectorThresholdv(AlGroups);
                    cerr << "MeetIndelDetectorThresholdv(AlGroups).size()= " << AlGroups.size() << endl;
                    // **************
                    if (ToRealign) {
                        //cerr << " ToRealign: " << refdataIter->RefName << "\t" << reflength << "\t" << winstartpos << "\t" << winendpos << "\t" << AlGroups.size() << endl;
                        //cerr << " minmaxRefSeqPos.at(1)= " << minmaxRefSeqPos.at(1) << " minmaxRefSeqPos.at(0)= " << minmaxRefSeqPos.at(0) << endl;
                        ////// Perform Realign routines
                        int TotalAlR = 0; // Total number of alignments to be realigned
                        int NumAlR = 0; // Now many alignments are aligned
                        TotalWindowDetected++;
                        cerr << "#Start: Meet Threshold, Realigning ... " << endl;
                        // widen the tracked min/max positions to cover the whole window
                        if (minmaxRefSeqPos.at(1) < winendpos) minmaxRefSeqPos.at(1) = winendpos;
                        if (minmaxRefSeqPos.at(0) > winstartpos) minmaxRefSeqPos.at(0) = winstartpos;
                        bool IsToRealign = RealignFunction.PruningByNaiveSelectionProcedureAndConstructHaplotypes2(winstartpos, winendpos, refid, refname, minmaxRefSeqPos, reference);
                        if (IsToRealign == true) {
                            RealignFunction.SelectHaplotypeCandidates_SmithWatermanBSv(AlGroups, minmaxRefSeqPos, SetLowComplexityRegionSWGapExt);
                            minmaxRefSeqPos.at(0) = -1;
                            minmaxRefSeqPos.at(1) = 0;
                            int nextwinstartpos = winendpos + 1;
                            int nextwinendpos = winstartpos + windowlen - 1;
                            if (nextwinendpos > reflength) nextwinendpos = reflength;
                            //cerr << " Before Realign : " << SortRealignedAlignmentsMultimap.size() << endl;
                            RealignFunction.AdjustCigarsWRTChosenMultipleHaplotypesAndPrepareAlignmentsTobeWrittenOut(AlGroups, SortRealignedAlignmentsMultimap, reference, RefIDRedName, minmaxRefSeqPos, nextwinstartpos, nextwinendpos, minbaseQ, TotalAlR, NumAlR, ploidy);
                            IsPassDetectorNoRealignment = false; // Set flag to false to deactivate write functions
                            //cerr << " After Realign : " << SortRealignedAlignmentsMultimap.size() << endl;
                            TotalReadsAligned += NumAlR;
                            if (NumAlR > 0) // Realignment done
                                windowrealigned++;
                        } else
                            cerr << "#Done: Meet Threshold, Realigning ... " << endl;
                    }
                    if (NewReadMappedcount > 0) TotalWindow++;
                    RealignFunction.Clear();

                    //// Move the window frame
                    winstartpos = winendpos + 1;
                    winendpos = winstartpos + windowlen - 1;
                }

                //// Save and Erase remaining alignments that are outside of window (Deque?)
                if ((!AlGroups.empty())) {
                    cerr << "#Start: Write Remaining alignments and delete all alignments" << endl;
                    for (vector<SalRealignInfo>::iterator Iter = AlGroups.begin(); Iter != AlGroups.end(); ++Iter) {
                        //cerr << " Remain alignment start: " << (*Iter).al.Name << " " << Iter->al.Position << " < " << winstartpos << " " << winendpos << endl;
                        SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > ((*Iter).al.Position, (*Iter).al));
                    }
                    cerr << "#Done: Write Remaining alignments and delete all alignments" << endl;
                }
                AlGroups.clear();

                // Write Sorted remaining Alignments that are outside of window
                if (!SortRealignedAlignmentsMultimap.empty()) {
                    for (multimap<int, BamAlignment>::iterator sraIter = SortRealignedAlignmentsMultimap.begin(); sraIter != SortRealignedAlignmentsMultimap.end(); ++sraIter) {
                        //writer.SaveAlignment((*sraIter).second);
                        if (!writer.SaveAlignment((*sraIter).second)) cerr << writer.GetErrorString() << endl;
                    }
                    SortRealignedAlignmentsMultimap.clear();
                }
            }

            ++regionListIter;
            // NOTE(review): regionListIter is dereferenced right after the increment
            // with no regionlist.end() check — on the final region this reads past
            // the end of the vector (UB). Needs a bounds check; verify.
            if ((*regionListIter).LeftRefID > refid) nextChrName = true;
        }
        //// If End of the chromosome position
        //// increament iterator
        ++refdataIter;
        ++refid;
    }

    reader.Close();
    writer.Close();
    cerr << "##-Completed- " << endl;
    cerr << " Total Reads processed = " << TotalReads << endl;
    cerr << " Total Reads Aligned = " << TotalReadsAligned << endl;
    cerr << " Total Window processed = " << TotalWindow << endl;
    cerr << " Total Window Detected = " << TotalWindowDetected << endl;
    cerr << " Total Windows Aligned = " << windowrealigned << endl;
    // Restore cerr's stream buffer before terminating
    if (cerrlog.is_open()) cerr.rdbuf(cerrsave);
    commandline.clear();
    return 0;
}
int main (int argc, char *argv[]) { if( (argc!= 4 && argc !=5 && argc !=6) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cerr<<"Usage:splitByRG [in bam] [rg Tally] [out prefix] (optional target)"<<endl<<"this program will subsample a BAM file per read group for a certain target\nFor example splitByRG in.bam tally.txt out will create\nout.rg1.bam\nout.rg2.bam\n"<<endl; return 1; } string bamfiletopen = string(argv[1]); string rgTally = string(argv[2]); string bamDirOutPrefix = string(argv[3]); int target = 200000; int maxTarget = 1000000; if(argc==5){ target = destringify<int> ( string(argv[4]) ); } if(argc==6){ target = destringify<int> ( string(argv[4]) ); maxTarget = destringify<int> ( string(argv[5]) ); } cerr<<"minimum fragments:\t"<<target<<endl; cerr<<"target fragments:\t"<<maxTarget<<endl; string line; ifstream myFileTally; map<string,double> rg2Fraction; myFileTally.open(rgTally.c_str(), ios::in); cerr<<"Retained groups:\n"<<endl; cerr<<"RG\t#mapped\tfraction retained"<<endl; cerr<<"-----------------------------------"<<endl; if (myFileTally.is_open()){ while ( getline (myFileTally,line)){ vector<string> tokens = allTokens(line,'\t'); if(tokens.size() > 6) if( tokens[1] == "pass" && (tokens[0] != "\"\"" && tokens[0] != "control" && tokens[0] != "TOTAL") ){ //cout<<tokens[0]<<"\t"<<tokens[5]<<endl; int count = destringify<int>(tokens[5]); if(count>target){ if(count>=maxTarget){ rg2Fraction[ tokens[0] ] = double(maxTarget)/double(count); cout<<tokens[0]<<"\t"<<count<<"\t"<<double(maxTarget)/double(count)<<endl; }else{ cout<<tokens[0]<<"\t"<<count<<"\t"<<1.0<<endl; rg2Fraction[ tokens[0] ] = 1.0; } } } } myFileTally.close(); }else{ cerr << "Unable to open file "<<rgTally<<endl; return 1; } map<string,BamWriter *> rg2BamWriter; // if(!isDirectory(bamDirOut)){ // cerr<<"ERROR: the out directory does not exist"<<endl; // return 1; // } BamReader reader; if ( 
!reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM files." << endl; return 1; } const SamHeader header = reader.GetHeader(); const RefVector references = reader.GetReferenceData(); vector<RefData> refData=reader.GetReferenceData(); SamReadGroupDictionary srgd=header.ReadGroups; for(SamReadGroupConstIterator srgci=srgd.ConstBegin(); srgci<srgd.ConstEnd(); srgci++){ //cout<<*srgci<<endl; const SamReadGroup rg = (*srgci); //cout<<rg.ID<<endl; if( rg2Fraction.find(rg.ID) != rg2Fraction.end() ){ rg2BamWriter[rg.ID] = new BamWriter(); rg2BamWriter[rg.ID]->Open(bamDirOutPrefix+"."+rg.ID+".bam",header,references); } //cout<<bamDirOutPrefix+"."+rg.ID+".bam"<<endl; } // return 1; // BamWriter unmapped; // cout<<header.ToString()<<endl; // return 1; // if ( !unmapped.Open(bamDirOutPrefix+".unmapped.bam",header,references) ) { // cerr << "Could not open output BAM file "<< bamDirOutPrefix+".unmapped.bam" << endl; // return 1; // } // cout<<"reading"<<endl; BamAlignment al; unsigned int total=0; while ( reader.GetNextAlignment(al) ) { if(al.HasTag("RG") && al.IsMapped() ){ string rgTag; al.GetTag("RG",rgTag); //cout<<rgTag<<endl; if(rg2BamWriter.find(rgTag) == rg2BamWriter.end()){ //new: ignore completely }else{ if( randomProb() <= rg2Fraction[ rgTag ] ){ rg2BamWriter[rgTag]->SaveAlignment(al); //cout<<"wrote "<<rgTag<<endl; } else{ //cout<<"skipped "<<rgTag<<endl; } } }// else{ // string rgTag="unknown"; // //cout<<rgTag<<endl; // if(rg2BamWriter.find(rgTag) == rg2BamWriter.end()){ //new // cerr<<"Found new RG "<<rgTag<<endl; // rg2BamWriter[rgTag] = new BamWriter(); // if ( !rg2BamWriter[rgTag]->Open(bamDirOutPrefix+"."+rgTag+".bam",header,references) ) { // cerr << "Could not open output BAM file "<< bamDirOutPrefix<<"."<<rgTag<<".bam" << endl; // return 1; // } // rg2BamWriter[rgTag]->SaveAlignment(al); // }else{ // rg2BamWriter[rgTag]->SaveAlignment(al); // } // // cerr << "Cannot get RG tag for " << al.Name<<endl; // // return 1; // } total++; } //while al 
reader.Close(); // writer.Close(); // unmapped.Close(); map<string,BamWriter *>::iterator rg2BamWriterIt; for (rg2BamWriterIt =rg2BamWriter.begin(); rg2BamWriterIt!=rg2BamWriter.end(); rg2BamWriterIt++){ rg2BamWriterIt->second->Close(); } cerr<<"Wrote succesfully "<<total<<" reads"<<endl; return 0; }
// Intersect each alignment of a BAM file against the "B" BED file.
// In BAM-output mode (-abam with BAM out), alignments are written to stdout
// when they meet the overlap criteria (or fail them, with -v). Otherwise each
// alignment is converted to a BED entry and overlaps are reported textually.
void BedIntersect::IntersectBam(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB = new BedFile(_bedBFile);
    _bedB->loadBedFileIntoMap();

    // create a dummy BED A file for printing purposes if not
    // using BAM output.
    if (_bamOutput == false) {
        _bedA = new BedFile(_bedAFile);
        _bedA->bedType = 12;
    }

    // open the BAM file
    BamReader reader;
    BamWriter writer;
    // FIX: the Open() return value was previously ignored; fail loudly on a
    // bad/missing BAM (consistent with BedGenomeCoverage::CoverageBam).
    if (!reader.Open(bamFile)) {
        cerr << "Failed to open BAM file " << bamFile << endl;
        exit(1);
    }

    // get header & reference information
    string bamHeader = reader.GetHeaderText();
    RefVector refs = reader.GetReferenceData();

    // open a BAM output to stdout if we are writing BAM
    if (_bamOutput == true) {
        // set compression mode
        BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
        if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
        writer.SetCompressionMode(compressionMode);
        // open our BAM writer
        writer.Open("stdout", bamHeader, refs);
    }

    vector<BED> hits;
    hits.reserve(100); // reserve some space

    BamAlignment bam;
    // get each set of alignments for each pair.
    while (reader.GetNextAlignment(bam)) {

        // save an unaligned read if -v
        if (!bam.IsMapped()) {
            if (_noHit == true) writer.SaveAlignment(bam);
            continue;
        }

        // break alignment into discrete blocks,
        bedVector bed_blocks;
        string chrom = refs.at(bam.RefID).RefName;
        GetBamBlocks(bam, chrom, bed_blocks, false, true);

        // create a basic BED entry from the BAM alignment
        BED bed;
        MakeBedFromBam(bam, chrom, bed_blocks, bed);

        bool overlapsFound = false;
        if ((_bamOutput == true) && (_obeySplits == false)) {
            // whole-span test is enough to decide whether to keep the alignment
            overlapsFound = _bedB->anyHits(bed.chrom, bed.start, bed.end, bed.strand,
                                           _sameStrand, _diffStrand, _overlapFraction, _reciprocal);
        } else if ( ((_bamOutput == true)  && (_obeySplits == true)) ||
                    ((_bamOutput == false) && (_obeySplits == true)) ) {
            // find the hits that overlap with the full span of the blocked BED
            _bedB->allHits(bed.chrom, bed.start, bed.end, bed.strand, hits,
                           _sameStrand, _diffStrand, _overlapFraction, _reciprocal);
            // find the overlaps between the block in A and B
            overlapsFound = FindBlockedOverlaps(bed, bed_blocks, hits, _bamOutput);
        } else if ((_bamOutput == false) && (_obeySplits == false)) {
            FindOverlaps(bed, hits);
        }

        // save the BAM alignment if overlap reqs. were met
        if (_bamOutput == true) {
            if ((overlapsFound == true) && (_noHit == false))
                writer.SaveAlignment(bam);
            else if ((overlapsFound == false) && (_noHit == true))
                writer.SaveAlignment(bam);
        }
        hits.clear();
    }

    // close the relevant BAM files.
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
}
// Intersect each alignment of a BAM file against the "B" BED file.
// In BAM-output mode, alignments with (or, with -v, without) overlaps are
// written to stdout; otherwise overlaps are reported as BED records.
// With -obeySplits, spliced/split alignments are decomposed into blocks and
// each block is tested independently.
void BedIntersect::IntersectBam(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedFileIntoMap();

    // open the BAM file
    BamReader reader;
    BamWriter writer;
    // FIX: the Open() return value was previously ignored; fail loudly on a
    // bad/missing BAM instead of silently producing no output.
    if (!reader.Open(bamFile)) {
        cerr << "Failed to open BAM file " << bamFile << endl;
        exit(1);
    }

    // get header & reference information
    string header = reader.GetHeaderText();
    RefVector refs = reader.GetReferenceData();

    // open a BAM output to stdout if we are writing BAM
    if (_bamOutput == true) {
        // open our BAM writer
        writer.Open("stdout", header, refs, _isUncompressedBam);
    }

    vector<BED> hits;
    hits.reserve(100); // reserve some space

    _bedA->bedType = 6;
    BamAlignment bam;
    // get each set of alignments for each pair.
    while (reader.GetNextAlignment(bam)) {

        if (bam.IsMapped()) {
            // convert the alignment's full span into a BED entry
            BED a;
            a.chrom = refs.at(bam.RefID).RefName;
            a.start = bam.Position;
            a.end   = bam.GetEndPosition(false);

            // build the name field from the BAM alignment.
            a.name = bam.Name;
            if (bam.IsFirstMate())  a.name += "/1";
            if (bam.IsSecondMate()) a.name += "/2";

            a.score  = ToString(bam.MapQuality);
            a.strand = "+";
            if (bam.IsReverseStrand()) a.strand = "-";

            if (_bamOutput == true) {
                bool overlapsFound = false;
                // treat the BAM alignment as a single "block"
                if (_obeySplits == false) {
                    overlapsFound = FindOneOrMoreOverlap(a);
                }
                // split the BAM alignment into discrete blocks and
                // look for overlaps only within each block.
                else {
                    bool overlapFoundForBlock;
                    bedVector bedBlocks; // vec to store the discrete BED "blocks" from a
                    // we don't want to split on "D" ops, hence the "false"
                    getBamBlocks(bam, refs, bedBlocks, false);

                    vector<BED>::const_iterator bedItr = bedBlocks.begin();
                    vector<BED>::const_iterator bedEnd = bedBlocks.end();
                    for (; bedItr != bedEnd; ++bedItr) {
                        // FIX: previously this queried the whole alignment span
                        // ("a") on every iteration, so -split had no effect in
                        // BAM-output mode; test each block instead (mirrors the
                        // non-BAM branch below, which uses *bedItr).
                        overlapFoundForBlock = FindOneOrMoreOverlap(*bedItr);
                        if (overlapFoundForBlock == true)
                            overlapsFound = true;
                    }
                }
                // write the alignment if the overlap result matches the -v mode
                if (overlapsFound == true) {
                    if (_noHit == false)
                        writer.SaveAlignment(bam);
                }
                else {
                    if (_noHit == true) {
                        writer.SaveAlignment(bam);
                    }
                }
            } else {
                // treat the BAM alignment as a single BED "block"
                if (_obeySplits == false) {
                    FindOverlaps(a, hits);
                    hits.clear();
                }
                // split the BAM alignment into discrete BED blocks and
                // look for overlaps only within each block.
                else {
                    bedVector bedBlocks; // vec to store the discrete BED "blocks" from a
                    getBamBlocks(bam, refs, bedBlocks, false);

                    vector<BED>::const_iterator bedItr = bedBlocks.begin();
                    vector<BED>::const_iterator bedEnd = bedBlocks.end();
                    for (; bedItr != bedEnd; ++bedItr) {
                        FindOverlaps(*bedItr, hits);
                        hits.clear();
                    }
                }
            }
        }
    }

    // close the relevant BAM files.
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
}
// Stream a BAM file and accumulate per-base coverage, one chromosome at a
// time. Depending on member flags, coverage is computed per CIGAR block
// (default), extended to a fixed fragment size (_haveSize), restricted to
// 5'/3' read ends, or extended across mate pairs (_pair_chip_).
// Results for each finished chromosome are flushed via StartNewChrom /
// ReportChromCoverage.
void BedGenomeCoverage::CoverageBam(string bamFile) {

    ResetChromCoverage();

    // open the BAM file
    BamReader reader;
    if (!reader.Open(bamFile)) {
        cerr << "Failed to open BAM file " << bamFile << endl;
        exit(1);
    }

    // get header & reference information
    string header = reader.GetHeaderText();
    RefVector refs = reader.GetReferenceData();

    // load the BAM header references into a BEDTools "genome file"
    // NOTE(review): allocated with new and never deleted here — presumably
    // owned/released elsewhere; confirm to rule out a leak.
    _genome = new GenomeFile(refs);

    // convert each aligned BAM entry to BED
    // and compute coverage on B
    BamAlignment bam;
    while (reader.GetNextAlignment(bam)) {
        // skip if the read is unaligned
        if (bam.IsMapped() == false)
            continue;

        bool _isReverseStrand = bam.IsReverseStrand();

        //changing second mate's strand to opposite (dUTP protocols flip mate 2)
        if( _dUTP && bam.IsPaired() && bam.IsMateMapped() && bam.IsSecondMate())
            _isReverseStrand = !bam.IsReverseStrand();

        // skip if we care about strands and the strand isn't what
        // the user wanted
        if ( (_filterByStrand == true) &&
             ((_requestedStrand == "-") != _isReverseStrand) )
            continue;

        // extract the chrom, start and end from the BAM alignment
        string chrom(refs.at(bam.RefID).RefName);
        CHRPOS start = bam.Position;
        // GetEndPosition returns one-past-the-last base; -1 makes `end` inclusive
        CHRPOS end = bam.GetEndPosition(false, false) - 1;

        // are we on a new chromosome?
        if ( chrom != _currChromName )
            StartNewChrom(chrom);

        if(_pair_chip_) {
            // Skip if not a proper pair
            if (bam.IsPaired() && (!bam.IsProperPair() or !bam.IsMateMapped()) )
                continue;
            // Skip if wrong coordinates
            if( ( (bam.Position<bam.MatePosition) && bam.IsReverseStrand() ) ||
                ( (bam.MatePosition < bam.Position) && bam.IsMateReverseStrand() ) ) {
                //chemically designed: left on positive strand, right on reverse one
                continue;
            }

            /*if(_haveSize) {
                if (bam.IsFirstMate() && bam.IsReverseStrand()) { //put fragmentSize in to the middle of pair end_fragment
                    int mid = bam.MatePosition+abs(bam.InsertSize)/2;
                    if(mid<_fragmentSize/2)
                        AddCoverage(0, mid+_fragmentSize/2);
                    else
                        AddCoverage(mid-_fragmentSize/2, mid+_fragmentSize/2);
                }
                else if (bam.IsFirstMate() && bam.IsMateReverseStrand()) { //put fragmentSize in to the middle of pair end_fragment
                    int mid = start+abs(bam.InsertSize)/2;
                    if(mid<_fragmentSize/2)
                        AddCoverage(0, mid+_fragmentSize/2);
                    else
                        AddCoverage(mid-_fragmentSize/2, mid+_fragmentSize/2);
                }
            } else */
            // only the first mate contributes, so each fragment is counted once
            if (bam.IsFirstMate() && bam.IsReverseStrand()) { //prolong to the mate to the left
                AddCoverage(bam.MatePosition, end);
            }
            else if (bam.IsFirstMate() && bam.IsMateReverseStrand()) {
                //prolong to the mate to the right
                AddCoverage(start, start + abs(bam.InsertSize) - 1);
            }
        } else if (_haveSize) {
            // extend every read to a fixed fragment size from its 5' end
            if(bam.IsReverseStrand()) {
                if(end<_fragmentSize) { //sometimes fragmentSize is bigger :(
                    AddCoverage(0, end);
                } else {
                    AddCoverage(end + 1 - _fragmentSize, end );
                }
            } else {
                AddCoverage(start,start+_fragmentSize - 1);
            }
        } else
        // add coverage accordingly.
        if (!_only_5p_end && !_only_3p_end) {
            bedVector bedBlocks;
            // we always want to split blocks when a D CIGAR op is found.
            // if the user invokes -split, we want to also split on N ops.
            if (_obeySplits) { // "D" true, "N" true
                GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks, true, true);
            }
            else { // "D" true, "N" false
                GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks, true, false);
            }
            AddBlockedCoverage(bedBlocks);
        }
        else if (_only_5p_end) {
            // 5' end is `start` on the forward strand, `end` on the reverse
            CHRPOS pos = ( !bam.IsReverseStrand() ) ? start : end;
            AddCoverage(pos,pos);
        }
        else if (_only_3p_end) {
            // 3' end is the mirror of the 5' case
            CHRPOS pos = ( bam.IsReverseStrand() ) ? start : end;
            AddCoverage(pos,pos);
        }
    }
    // close the BAM
    reader.Close();

    // process the results of the last chromosome.
    ReportChromCoverage(_currChromCoverage, _currChromSize,
            _currChromName, _currChromDepthHist);

    // report all empty chromsomes
    PrintEmptyChromosomes();

    // report the overall coverage if asked.
    PrintFinalCoverage();
}
/*
    Input file format: BAM file
*/
// Split alignments by haplotype. Reads (or read pairs) supporting haplotype 1
// go to <prefix>.1.bam, haplotype 2 to <prefix>.2.bam, reads carrying no
// heterozygous SNP to the "h**o" file, and ambiguous/conflicting reads to
// <prefix>.ambiguity.bam. Mates are buffered in firstMateAl so a pair can be
// classified jointly once both mates have been seen.
// [WARNING] If refSeq is not empty than it's BS (bisulfite) data, otherwise it
// is resequencing data; BS reads get their bases corrected before deciding.
void Distinguish( string mappingFile, map< string, bool > & hapSNP, map< string, string > & refSeq, string outfilePrefix ) {

    string outfile1 = outfilePrefix + ".1.bam";
    string outfile2 = outfilePrefix + ".2.bam";
    // NOTE(review): ".h**o.bam" looks like a censored/placeholder literal —
    // confirm the intended file name.
    string outHomoF = outfilePrefix + ".h**o.bam";
    string outUdeci = outfilePrefix + ".ambiguity.bam"; // The ambiguity reads or the reads which hasn't any SNP.

    BamReader h_I; // bam input file handle
    // NOTE(review): on open failure only a message is printed; execution
    // continues with an unopened reader — confirm whether exit(1) was intended.
    if ( !h_I.Open( mappingFile ) ) cerr << "[ERROR]: " << h_I.GetErrorString() << endl;
    // "header" and "references" from BAM files, these are required by BamWriter
    const SamHeader header = h_I.GetHeader();
    const RefVector references = h_I.GetReferenceData();

    // four output writers: hap1, hap2, ambiguous, homozygous
    BamWriter h_O1, h_O2, h_U, h_H;
    if ( !h_O1.Open( outfile1, header, references ) ) { cerr << "Cannot open output BAM file: " << outfile1 << endl; exit(1); }
    if ( !h_O2.Open( outfile2, header, references ) ) { cerr << "Cannot open output BAM file: " << outfile2 << endl; exit(1); }
    if ( !h_U.Open ( outUdeci, header, references ) ) { cerr << "Cannot open output BAM file: " << outUdeci << endl; exit(1); }
    if ( !h_H.Open ( outHomoF, header, references ) ) { cerr << "Cannot open output BAM file: " << outHomoF << endl; exit(1); }

    int readsNumberRecord(0);   // progress counter
    SamLine samline; // Samline class
    SamExt sam;
    BamAlignment al;
    map< string, pair<BamAlignment, SamExt> > firstMateAl; // record the first mate reads alignment, HIstory problem to be like this struct!!
    string refstr;          // Just For BS data.
    bool isC2T ( false );   // Just For BS data.
    while ( h_I.GetNextAlignment( al ) ) {

        ++readsNumberRecord;
        if ( readsNumberRecord % 1000000 == 0 ) cerr << "Have been dealed " << readsNumberRecord << " lines. " << local_time ();
        if ( !al.IsMapped() ) continue;
        //if ( al.InsertSize == 0 || al.RefID != al.MateRefID ) continue;

        // copy the BamTools alignment into the project's SAM-line structure
        samline._RID = al.Name;
        samline._Flag= al.AlignmentFlag;
        samline._ref_id = h_I.GetReferenceData()[al.RefID].RefName;
        samline._position = al.Position + 1; // Position (0-base starts in BamTools), but I need 1-base starts
        samline._mapQ = al.MapQuality;
        // MateRefID == -1 means mate read is unmapping
        samline._XorD = ( al.MateRefID > -1 ) ? h_I.GetReferenceData()[al.MateRefID].RefName : "*";
        samline._coor = al.MatePosition + 1; // Position (0-base starts in BamTools), but I need 1-base starts
        samline._seq = al.QueryBases;
        samline._insert_size = abs (al.InsertSize);

        if ( samline._ref_id.compare( "BIG_ID_CAT" ) == 0 ) continue; // Ignore "BIG_ID_CAT"

        // get cigar;
        // NOTE(review): CigarData[0] is accessed unchecked — assumes every
        // mapped read has a non-empty CIGAR; confirm this holds upstream.
        samline._cigar = itoa(al.CigarData[0].Length);
        samline._cigar.append( 1, al.CigarData[0].Type );
        for ( size_t i(1); i < al.CigarData.size(); ++i ) {
            samline._cigar += itoa(al.CigarData[i].Length);
            samline._cigar.append( 1, al.CigarData[i].Type );
        }
        sam.assign( &samline );

        /*********************************** For BS Data *********************************************/
        if ( !refSeq.empty() ) { // If the data is BS data, we should modify the QueryBases.

            if ( !refSeq.count( samline._ref_id ) ) {
                cerr << "[ERROR]There's no such reference in the reference file. " << samline._ref_id << endl;
                exit(1);
            }
            // C->T conversion applies to first-mate forward and second-mate
            // reverse reads; the other two orientations are G->A space.
            if ( al.IsFirstMate() && !al.IsReverseStrand() ) {
                isC2T = true;
            } else if ( al.IsFirstMate() && al.IsReverseStrand() ) {
                isC2T = false;
            } else if ( al.IsSecondMate() && !al.IsReverseStrand() ) {
                isC2T = false;
            } else if ( al.IsSecondMate() && al.IsReverseStrand() ) {
                isC2T = true;
            } else {
                cerr << "[ERROR MATCH] " << endl; exit(1);
            }
            // slice out the reference segment this read covers (1-based coords)
            refstr.assign( refSeq[samline._ref_id], sam.ref_start() - 1, sam.ref_end() - sam.ref_start() + 1 );
            modifyBSreadBases( samline._ref_id, sam.ref_start (), sam.read_start(), sam.cigar_seq(), refstr, sam._seq, hapSNP, isC2T );
        }
        /********************************** End For BS Data *******************************************/

        // Consider the mate pair reads: buffer the first mate seen for each
        // read name (only when the mate is mapped, MateRefID > -1).
        if ( !firstMateAl.count(al.Name) && (al.MateRefID > -1) ) {
            firstMateAl[al.Name] = std::make_pair( al, sam );
        } else { // Consider the mate pair reads
            if ( !firstMateAl.count(al.Name) ) {
                // mate is unmapped: classify this read alone.
                // Decide(): 1 => hap1, 2 => hap2, 0 => ambiguous,
                // anything else (no hete SNP) => homozygous (from case labels).
                switch ( Decide( sam, hapSNP ) ) {
                    case 1 : h_O1.SaveAlignment( al ); break; // Hap1
                    case 2 : h_O2.SaveAlignment( al ); break; // Hap2
                    case 0 : h_U.SaveAlignment ( al ); break; // Ambiguity
                    default: // This alignment didn't contain any hete SNP.
                        h_H.SaveAlignment ( al ); // Homozygous reads
                }
            } else {
                // both mates seen: classify the pair jointly.
                int mark1 = Decide( firstMateAl[al.Name].second, hapSNP );
                int mark2 = Decide( sam, hapSNP );
                // hap1 wins if either mate says hap1 and the other is neutral
                if ( mark1 == 1 && mark2 == 1 ) {
                    h_O1.SaveAlignment( firstMateAl[al.Name].first ); h_O1.SaveAlignment( al );
                } else if ( (mark1 == 1 && mark2 == 0) || (mark1 == 0 && mark2 == 1) ) {
                    h_O1.SaveAlignment( firstMateAl[al.Name].first ); h_O1.SaveAlignment( al );
                } else if ( (mark1 == 1 && mark2 == -1) || (mark1 == -1 && mark2 == 1) ) {
                    h_O1.SaveAlignment( firstMateAl[al.Name].first ); h_O1.SaveAlignment( al );
                } else if ( mark1 == 2 && mark2 == 2 ) {
                    h_O2.SaveAlignment( firstMateAl[al.Name].first ); h_O2.SaveAlignment( al );
                } else if ( (mark1 == 2 && mark2 == 0 ) || (mark1 == 0 && mark2 == 2) ) {
                    h_O2.SaveAlignment( firstMateAl[al.Name].first ); h_O2.SaveAlignment( al );
                } else if ( (mark1 == 2 && mark2 == -1) || (mark1 == -1 && mark2 == 2) ) {
                    h_O2.SaveAlignment( firstMateAl[al.Name].first ); h_O2.SaveAlignment( al );
                } else if ( mark1 == -1 && mark2 == -1 ) {
                    // neither mate carries a hete SNP
                    h_H.SaveAlignment ( firstMateAl[al.Name].first ); h_H.SaveAlignment ( al );
                } else {
                    // conflicting or ambiguous evidence (e.g. 1 vs 2)
                    h_U.SaveAlignment ( firstMateAl[al.Name].first ); h_U.SaveAlignment ( al );
                }
                firstMateAl.erase( al.Name );
            }
        }
    }
    // flush reads whose mate never arrived: classify them individually
    cerr << "------------ Remaind size: " << firstMateAl.size() << endl;
    for (map< string, pair<BamAlignment, SamExt> >::iterator it( firstMateAl.begin() ); it != firstMateAl.end(); ++it ) {
        switch ( Decide( it->second.second, hapSNP ) ) {
            case 1 : h_O1.SaveAlignment( it->second.first ); break; // Hap1
            case 2 : h_O2.SaveAlignment( it->second.first ); break; // Hap2
            case 0 : h_U.SaveAlignment ( it->second.first ); break; // Ambiguity
            default: // This alignment didn't contain any hete SNP.
                h_H.SaveAlignment ( it->second.first ); // Homozygous reads
        }
    }

    h_I.Close();
    h_U.Close();
    h_H.Close();
    h_O1.Close();
    h_O2.Close();
    cerr << ">>>>>>>>>>>>> All Done <<<<<<<<<<<<<<" << endl;
    cerr << "Write to output file: " << outfile1 << endl;
    cerr << "Write to output file: " << outfile2 << endl;
    cerr << "Write to output file: " << outHomoF << endl;
    cerr << "Write to output file: " << outUdeci << endl;
    return;
}
void setMateInfo( BamAlignment & rec1, BamAlignment & rec2, SamHeader & header) { const int NO_ALIGNMENT_REFERENCE_INDEX = -1; const int NO_ALIGNMENT_START = -1; // If neither read is unmapped just set their mate info if (rec1.IsMapped() && rec2.IsMapped()) { rec1.MateRefID = rec2.MateRefID; rec1.MatePosition = rec2.Position; rec1.SetIsReverseStrand(rec2.IsReverseStrand()); rec1.SetIsMapped(true); rec1.AddTag("MQ", "i", rec2.MapQuality); rec2.MateRefID = rec1.RefID; rec2.MatePosition = rec1.Position; rec2.SetIsReverseStrand( rec1.IsReverseStrand() ); rec2.SetIsMapped(true); rec2.AddTag("MQ", "i", rec1.MapQuality); } // Else if they're both unmapped set that straight else if (!rec1.IsMapped() && !rec2.IsMapped()) { rec1.RefID = NO_ALIGNMENT_REFERENCE_INDEX; rec1.Position = NO_ALIGNMENT_START; rec1.MateRefID = NO_ALIGNMENT_REFERENCE_INDEX; rec1.MatePosition = NO_ALIGNMENT_START; rec1.SetIsReverseStrand(rec2.IsReverseStrand()); rec1.SetIsMapped(false); rec2.RemoveTag("MQ"); rec1.Length = 0; rec2.RefID = NO_ALIGNMENT_REFERENCE_INDEX; rec2.Position = NO_ALIGNMENT_START; rec2.MateRefID = NO_ALIGNMENT_REFERENCE_INDEX; rec2.MatePosition = NO_ALIGNMENT_START; rec2.SetIsReverseStrand(rec1.IsReverseStrand()); rec2.SetIsMapped(false); rec2.RemoveTag("MQ"); rec2.Length = 0; } // And if only one is mapped copy it's coordinate information to the mate else { BamAlignment & mapped = rec1.IsMapped() ? rec1 : rec2; BamAlignment & unmapped = rec1.IsMapped() ? 
rec2 : rec1; unmapped.RefID = mapped.RefID; unmapped.Position = mapped.Position; mapped.MateRefID = unmapped.RefID; mapped.MatePosition = unmapped.Position; mapped.SetIsMateReverseStrand(unmapped.IsReverseStrand()); mapped.SetIsMateMapped(false); mapped.Length = 0; unmapped.MateRefID = mapped.RefID; unmapped.MatePosition = mapped.Position; unmapped.SetIsMateReverseStrand(mapped.IsReverseStrand()); unmapped.SetIsMateMapped(true); unmapped.Length = 0; } const int insertSize = computeInsertSize(rec1, rec2); rec1.Length = insertSize; rec2.Length = -insertSize; }
int main (int argc, char *argv[]) { // bool mapped =false; // bool unmapped=false; int bpToDecrease5=1; int bpToDecrease3=2; const string usage=string(string(argv[0])+" [options] input.bam out.bam"+"\n\n"+ "\tThis program takes a BAM file as input and produces\n"+ "\tanother where the putative deaminated bases have\n"+ "\ta base quality score of "+intStringify(baseQualForDeam)+"\n"+ "\tgiven an "+intStringify(offset)+" offset \n"+ "\n"+ "\tOptions:\n"+ "\t\t"+"-n5" +"\t\t\t"+"Decrease the nth bases surrounding the 5' ends (Default:"+stringify(bpToDecrease5)+") "+"\n"+ "\t\t"+"-n3" +"\t\t\t"+"Decrease the nth bases surrounding the 3' ends (Default:"+stringify(bpToDecrease3)+") "+"\n" ); // "\t"+"-u , --unmapped" +"\n\t\t"+"For an unmapped bam file"+"\n"+ // "\t"+"-m , --mapped" +"\n\t\t"+"For an mapped bam file"+"\n"); if( (argc== 1) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cout<<"Usage:"<<endl; cout<<usage<<endl; cout<<""<<endl; return 1; } for(int i=1;i<(argc-2);i++){ //all but the last arg if( string(argv[i]) == "-n5" ){ bpToDecrease5 = destringify<int>(argv[i+1]); i++; continue; } if( string(argv[i]) == "-n3" ){ bpToDecrease3 = destringify<int>(argv[i+1]); i++; continue; } cerr<<"Unknown option "<<argv[i] <<" exiting"<<endl; return 1; } if(argc < 3){ cerr<<"Error: Must specify the input and output BAM files"; return 1; } string inbamFile =argv[argc-2]; string outbamFile=argv[argc-1]; if(inbamFile == outbamFile){ cerr<<"Input and output files are the same"<<endl; return 1; } // if(!mapped && !unmapped){ // cerr << "Please specify whether you reads are mapped or unmapped" << endl; // return 1; // } // if(mapped && unmapped){ // cerr << "Please specify either mapped or unmapped but not both" << endl; // return 1; // } BamReader reader; if ( !reader.Open(inbamFile) ) { cerr << "Could not open input BAM files." 
<< endl; return 1; } vector<RefData> testRefData=reader.GetReferenceData(); SamHeader header = reader.GetHeader(); string pID = "decrQualDeaminated"; string pName = "decrQualDeaminated"; string pCommandLine = ""; for(int i=0;i<(argc);i++){ pCommandLine += (string(argv[i])+" "); } putProgramInHeader(&header,pID,pName,pCommandLine); const RefVector references = reader.GetReferenceData(); BamWriter writer; if ( !writer.Open(outbamFile, header, references) ) { cerr << "Could not open output BAM file" << endl; return 1; } BamAlignment al; // BamAlignment al2; // bool al2Null=true; while ( reader.GetNextAlignment(al) ) { if(al.IsPaired() ){ if(al.IsFirstMate() ){ //5' end, need to check first base only if(al.IsReverseStrand()){ // if(!al.IsMapped()){ cerr << "Cannot have reverse complemented unmapped reads :" <<al.Name<< endl; //return 1; } int indexToCheck; //5' of first mate reversed indexToCheck=al.QueryBases.length()-1; for(int i=0;i<bpToDecrease5;i++){ if(toupper(al.QueryBases[indexToCheck]) == 'A'){ al.Qualities[indexToCheck]=char(offset+baseQualForDeam); } indexToCheck=max(indexToCheck-1,0); } }else{ int indexToCheck; //5' of first mate indexToCheck=0; for(int i=0;i<bpToDecrease5;i++){ //first base if(toupper(al.QueryBases[indexToCheck]) == 'T'){ al.Qualities[indexToCheck]=char(offset+baseQualForDeam); } indexToCheck=min(indexToCheck+1,int(al.Qualities.size())); } } }else{ //3' end, need to check last two bases only if( al.IsSecondMate() ){ if(al.IsReverseStrand()){ // if(!al.IsMapped()){ cerr << "Cannot have reverse complemented unmapped reads :" <<al.Name<< endl; //return 1; } int indexToCheck; //3' of second mate reversed indexToCheck=al.QueryBases.length()-1; for(int i=0;i<bpToDecrease3;i++){ if(toupper(al.QueryBases[indexToCheck]) == 'T'){ al.Qualities[indexToCheck]=char(offset+baseQualForDeam); } indexToCheck=max(indexToCheck-1,0); } }else{ int indexToCheck; //3' of second mate forward indexToCheck=0; for(int i=0;i<bpToDecrease3;i++){ //first base 
if(toupper(al.QueryBases[indexToCheck]) == 'A'){ al.Qualities[indexToCheck]=char(offset+baseQualForDeam); } indexToCheck=min(indexToCheck+1,int(al.Qualities.size())); } } }else{ cerr << "Wrong state" << endl; return 1; } } }//end of paired end else{//we consider single reads to have been sequenced from 5' to 3' if(al.IsReverseStrand()){ //need to consider if(!al.IsMapped()){ cerr << "Cannot have reverse complemented unmapped reads :" <<al.Name<< endl; //return 1; } int indexToCheck; //5' of single read reversed indexToCheck=al.QueryBases.length()-1; for(int i=0;i<bpToDecrease5;i++){ if(toupper(al.QueryBases[indexToCheck]) == 'A'){ al.Qualities[indexToCheck]=char(offset+baseQualForDeam); } indexToCheck=max(indexToCheck-1,0); } //3' of single read reversed indexToCheck=0; for(int i=0;i<bpToDecrease3;i++){ //first base if(toupper(al.QueryBases[indexToCheck]) == 'A'){ al.Qualities[indexToCheck]=char(offset+baseQualForDeam); } indexToCheck=min(indexToCheck+1,int(al.Qualities.size())); } }else{ int indexToCheck; //5' of single read indexToCheck=0; for(int i=0;i<bpToDecrease5;i++){ //first base if(toupper(al.QueryBases[indexToCheck]) == 'T'){ al.Qualities[indexToCheck]=char(offset+baseQualForDeam); } indexToCheck=min(indexToCheck+1,int(al.Qualities.size())); } //3' of single read indexToCheck=al.QueryBases.length()-1; for(int i=0;i<bpToDecrease3;i++){ if(toupper(al.QueryBases[indexToCheck]) == 'T'){ al.Qualities[indexToCheck]=char(offset+baseQualForDeam); } indexToCheck=max(indexToCheck-1,0); } } }//end of single end writer.SaveAlignment(al); }// while ( reader.GetNextAlignment(al) ) { reader.Close(); writer.Close(); cerr<<"Program terminated gracefully"<<endl; return 0; }
bool check(const PropertyFilter& filter, const BamAlignment& al) { bool keepAlignment = true; const PropertyMap& properties = filter.Properties; PropertyMap::const_iterator propertyIter = properties.begin(); PropertyMap::const_iterator propertyEnd = properties.end(); for ( ; propertyIter != propertyEnd; ++propertyIter ) { // check alignment data field depending on propertyName const string& propertyName = (*propertyIter).first; const PropertyFilterValue& valueFilter = (*propertyIter).second; if ( propertyName == ALIGNMENTFLAG_PROPERTY ) keepAlignment &= valueFilter.check(al.AlignmentFlag); else if ( propertyName == CIGAR_PROPERTY ) { stringstream cigarSs; const vector<CigarOp>& cigarData = al.CigarData; if ( !cigarData.empty() ) { vector<CigarOp>::const_iterator cigarBegin = cigarData.begin(); vector<CigarOp>::const_iterator cigarIter = cigarBegin; vector<CigarOp>::const_iterator cigarEnd = cigarData.end(); for ( ; cigarIter != cigarEnd; ++cigarIter ) { const CigarOp& op = (*cigarIter); cigarSs << op.Length << op.Type; } keepAlignment &= valueFilter.check(cigarSs.str()); } } else if ( propertyName == INSERTSIZE_PROPERTY ) keepAlignment &= valueFilter.check(al.InsertSize); else if ( propertyName == ISDUPLICATE_PROPERTY ) keepAlignment &= valueFilter.check(al.IsDuplicate()); else if ( propertyName == ISFAILEDQC_PROPERTY ) keepAlignment &= valueFilter.check(al.IsFailedQC()); else if ( propertyName == ISFIRSTMATE_PROPERTY ) keepAlignment &= valueFilter.check(al.IsFirstMate()); else if ( propertyName == ISMAPPED_PROPERTY ) keepAlignment &= valueFilter.check(al.IsMapped()); else if ( propertyName == ISMATEMAPPED_PROPERTY ) keepAlignment &= valueFilter.check(al.IsMateMapped()); else if ( propertyName == ISMATEREVERSESTRAND_PROPERTY ) keepAlignment &= valueFilter.check(al.IsMateReverseStrand()); else if ( propertyName == ISPAIRED_PROPERTY ) keepAlignment &= valueFilter.check(al.IsPaired()); else if ( propertyName == ISPRIMARYALIGNMENT_PROPERTY ) keepAlignment &= 
valueFilter.check(al.IsPrimaryAlignment()); else if ( propertyName == ISPROPERPAIR_PROPERTY ) keepAlignment &= valueFilter.check(al.IsProperPair()); else if ( propertyName == ISREVERSESTRAND_PROPERTY ) keepAlignment &= valueFilter.check(al.IsReverseStrand()); else if ( propertyName == ISSECONDMATE_PROPERTY ) keepAlignment &= valueFilter.check(al.IsSecondMate()); else if ( propertyName == ISSINGLETON_PROPERTY ) { const bool isSingleton = al.IsPaired() && al.IsMapped() && !al.IsMateMapped(); keepAlignment &= valueFilter.check(isSingleton); } else if ( propertyName == MAPQUALITY_PROPERTY ) keepAlignment &= valueFilter.check(al.MapQuality); else if ( propertyName == MATEPOSITION_PROPERTY ) keepAlignment &= ( al.IsPaired() && al.IsMateMapped() && valueFilter.check(al.MateRefID) ); else if ( propertyName == MATEREFERENCE_PROPERTY ) { if ( !al.IsPaired() || !al.IsMateMapped() ) return false; BAMTOOLS_ASSERT_MESSAGE( (al.MateRefID>=0 && (al.MateRefID<(int)filterToolReferences.size())), "Invalid MateRefID"); const string& refName = filterToolReferences.at(al.MateRefID).RefName; keepAlignment &= valueFilter.check(refName); } else if ( propertyName == NAME_PROPERTY ) keepAlignment &= valueFilter.check(al.Name); else if ( propertyName == POSITION_PROPERTY ) keepAlignment &= valueFilter.check(al.Position); else if ( propertyName == QUERYBASES_PROPERTY ) keepAlignment &= valueFilter.check(al.QueryBases); else if ( propertyName == REFERENCE_PROPERTY ) { BAMTOOLS_ASSERT_MESSAGE( (al.RefID>=0 && (al.RefID<(int)filterToolReferences.size())), "Invalid RefID"); const string& refName = filterToolReferences.at(al.RefID).RefName; keepAlignment &= valueFilter.check(refName); } else if ( propertyName == TAG_PROPERTY ) keepAlignment &= checkAlignmentTag(valueFilter, al); else BAMTOOLS_ASSERT_UNREACHABLE; // if alignment fails at ANY point, just quit and return false if ( !keepAlignment ) return false; } BAMTOOLS_ASSERT_MESSAGE( keepAlignment, "Error in BamAlignmentChecker... 
keepAlignment should be true here"); return keepAlignment; }
// Report BAM alignments that fall within a window of intervals in the "B"
// file. In BAM-output mode, hits (or, with -v, misses) are streamed to
// stdout as BAM; otherwise overlapping windows are reported per alignment.
void BedWindow::WindowIntersectBam(string bamFile) {

    // index the "B" database intervals for fast window lookups
    _bedB->loadBedFileIntoMap();

    BamReader reader;
    BamWriter writer;
    reader.Open(bamFile);

    // header text and reference list are needed to open the writer
    string bamHeader = reader.GetHeaderText();
    RefVector refs   = reader.GetReferenceData();

    if (_bamOutput == true) {
        // pick the compression level, then route BAM output to stdout
        BamWriter::CompressionMode compressionMode =
            _isUncompressedBam ? BamWriter::Uncompressed : BamWriter::Compressed;
        writer.SetCompressionMode(compressionMode);
        writer.Open("stdout", bamHeader, refs);
    }

    vector<BED> hits;  // scratch vector of candidate window overlaps
    hits.reserve(100);

    _bedA->bedType = 6;
    BamAlignment bam;
    bool overlapsFound;
    while (reader.GetNextAlignment(bam)) {

        if (!bam.IsMapped()) {
            // unmapped reads are emitted only when misses were requested
            if (_noHit == true)
                writer.SaveAlignment(bam);
            continue;
        }

        // project the alignment into a BED6 record
        BED a;
        a.chrom = refs.at(bam.RefID).RefName;
        a.start = bam.Position;
        a.end   = bam.GetEndPosition(false, false);

        // name carries the mate suffix, as in FASTQ conventions
        a.name = bam.Name;
        if (bam.IsFirstMate())  a.name += "/1";
        if (bam.IsSecondMate()) a.name += "/2";

        a.score  = ToString(bam.MapQuality);
        a.strand = bam.IsReverseStrand() ? "-" : "+";

        if (_bamOutput == true) {
            overlapsFound = FindOneOrMoreWindowOverlaps(a);
            // save hits unless -v was given; save misses only with -v
            if (overlapsFound != _noHit)
                writer.SaveAlignment(bam);
        }
        else {
            FindWindowOverlaps(a, hits);
            hits.clear();
        }
    }

    // tear down the BAM handles
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
}
void TagBam::Tag() { // open the annotations files for processing; OpenAnnoFiles(); // open the BAM file BamReader reader; BamWriter writer; if (!reader.Open(_bamFile)) { cerr << "Failed to open BAM file " << _bamFile << endl; exit(1); } // get header & reference information string bamHeader = reader.GetHeaderText(); RefVector refs = reader.GetReferenceData(); // set compression mode BamWriter::CompressionMode compressionMode = BamWriter::Compressed; // if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed; writer.SetCompressionMode(compressionMode); // open our BAM writer writer.Open("stdout", bamHeader, refs); // rip through the BAM file and test for overlaps with each annotation file. BamAlignment al; vector<BED> hits; while (reader.GetNextAlignment(al)) { if (al.IsMapped() == true) { BED a; a.chrom = refs.at(al.RefID).RefName; a.start = al.Position; a.end = al.GetEndPosition(false, false); a.strand = "+"; if (al.IsReverseStrand()) a.strand = "-"; ostringstream annotations; // annotate the BAM file based on overlaps with the annotation files. for (size_t i = 0; i < _annoFiles.size(); ++i) { // grab the current annotation file. 
BedFile *anno = _annoFiles[i]; if (!_useNames && !_useScores && !_useIntervals) { // add the label for this annotation file to tag if there is overlap if (anno->anyHits(a.chrom, a.start, a.end, a.strand, _sameStrand, _diffStrand, _overlapFraction, false)) { annotations << _annoLabels[i] << ";"; } } // use the score field else if (!_useNames && _useScores && !_useIntervals) { anno->allHits(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand, 0.0, false); for (size_t i = 0; i < hits.size(); ++i) { annotations << hits[i].score; if (i < hits.size() - 1) annotations << ","; } if (hits.size() > 0) annotations << ";"; hits.clear(); } // use the name field from the annotation files to populate tag else if (_useNames && !_useScores && !_useIntervals) { anno->allHits(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand, 0.0, false); for (size_t j = 0; j < hits.size(); ++j) { annotations << hits[j].name; if (j < hits.size() - 1) annotations << ","; } if (hits.size() > 0) annotations << ";"; hits.clear(); } // use the full interval information annotation files to populate tag else if (!_useNames && !_useScores && _useIntervals) { anno->allHits(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand, 0.0, false); for (size_t j = 0; j < hits.size(); ++j) { annotations << _annoLabels[i] << ":" << hits[j].chrom << ":" << hits[j].start << "-" << hits[j].end << "," << hits[j].name << "," << hits[j].score << "," << hits[j].strand; if (j < hits.size() - 1) annotations << ","; } if (hits.size() > 0) annotations << ";"; hits.clear(); } } // were there any overlaps with which to make a tag? if (annotations.str().size() > 0) { al.AddTag(_tag, "Z", annotations.str().substr(0, annotations.str().size() - 1)); // get rid of the last ";" } } writer.SaveAlignment(al); } reader.Close(); writer.Close(); // close the annotations files; CloseAnnoFiles(); }
int main (int argc, char *argv[]) { string usage=string(""+string(argv[0])+" [in BAM file]"+ "\nThis program reads a BAM file and computes the error rate for each cycle\n"+ // "\nreads and the puts the rest into another bam file.\n"+ // "\nTip: if you do not need one of them, use /dev/null as your output\n"+ // "arguments:\n"+ // "\t"+"--bq [base qual] : Minimum base quality to flag a deaminated site (Default: "+stringify(minBaseQuality)+")\n"+ "\n"); if(argc == 1 || (argc == 2 && (string(argv[0]) == "-h" || string(argv[0]) == "--help") ) ){ cerr << "Usage "<<usage<<endl; return 1; } // for(int i=1;i<(argc-2);i++){ // if(string(argv[i]) == "--bq"){ // minBaseQuality=destringify<int>(argv[i+1]); // i++; // continue; // } // } string bamfiletopen = string( argv[ argc-1 ] ); // string deambam = string( argv[ argc-2 ] ); // string nondeambam = string( argv[ argc-1 ] ); BamReader reader; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM file"<< bamfiletopen << endl; return 1; } //iterating over the alignments for these regions BamAlignment al; bool pairedEnd=false; bool firstRead=true; while ( reader.GetNextAlignment(al) ) { if(firstRead){ //reads are either all paired end or single end, I don't allow a mix numberOfCycles=al.QueryBases.size(); // cout<<"numberOfCycles "<<numberOfCycles<<endl; if(al.IsPaired() ){ pairedEnd=true; matches = vector<unsigned int> (2*numberOfCycles,0); mismatches = vector<unsigned int> (2*numberOfCycles,0); typesOfMismatches = vector< vector<unsigned int> >(); for(int i=0;i<12;i++) typesOfMismatches.push_back( vector<unsigned int> (2*numberOfCycles,0) ); }else{ matches = vector<unsigned int> ( numberOfCycles,0); mismatches = vector<unsigned int> ( numberOfCycles,0); typesOfMismatches = vector< vector<unsigned int> >(); for(int i=0;i<12;i++) typesOfMismatches.push_back( vector<unsigned int> ( numberOfCycles,0) ); } firstRead=false; } if( ( pairedEnd && !al.IsPaired()) || ( !pairedEnd && al.IsPaired()) ){ cerr<<"Read 
"<<al.Name<<" is wrong, cannot have a mixture of paired and unpaired read for this program"<<endl; return 1; } //skip unmapped if(!al.IsMapped()) continue; if(numberOfCycles!=int(al.QueryBases.size())){ cerr<<"The length of read "<<al.Name<<" is wrong, should be "<<numberOfCycles<<"bp"<<endl; return 1; } string reconstructedReference = reconstructRef(&al); if(al.Qualities.size() != reconstructedReference.size()){ cerr<<"Quality line is not the same size as the reconstructed reference"<<endl; return 1; } if( pairedEnd ){ if( al.IsFirstMate() ){ //start cycle 0 if( al.IsReverseStrand() ){ increaseCounters(al,reconstructedReference,numberOfCycles-1,-1); //start cycle numberOfCycles-1 }else{ increaseCounters(al,reconstructedReference,0 , 1); //start cycle 0 } }else{ if( al.IsSecondMate() ){ if( al.IsReverseStrand() ){ increaseCounters(al,reconstructedReference,2*numberOfCycles-1,-1); //start cycle 2*numberOfCycles-1 }else{ increaseCounters(al,reconstructedReference,numberOfCycles , 1); //start cycle numberOfCycles } }else{ cerr<<"Reads "<<al.Name<<" must be either first or second mate"<<endl; return 1; } } }else{ //single end if( al.IsReverseStrand() ){ increaseCounters(al,reconstructedReference,numberOfCycles-1,-1); //start cycle numberOfCycles-1 }else{ increaseCounters(al,reconstructedReference,0 , 1); //start cycle 0 } } }//end while each read reader.Close(); cout<<"cycle\tmatches\tmismatches\tmismatches%\tA>C\tA>C%\tA>G\tA>G%\tA>T\tA>T%\tC>A\tC>A%\tC>G\tC>G%\tC>T\tC>T%\tG>A\tG>A%\tG>C\tG>C%\tG>T\tG>T%\tT>A\tT>A%\tT>C\tT>C%\tT>G\tT>G%"<<endl; for(unsigned int i=0;i<matches.size();i++){ cout<<(i+1); if( (matches[i]+mismatches[i]!=0) ) cout<<"\t"<<matches[i]<<"\t"<<mismatches[i]<<"\t"<< 100.0*(double(mismatches[i])/double(matches[i]+mismatches[i])) ; else cout<<"\t"<<matches[i]<<"\t"<<mismatches[i]<<"\tNA"; for(int j=0;j<12;j++){ cout<<"\t"<<typesOfMismatches[j][i]; if( (matches[i]+mismatches[i]!=0) ) 
cout<<"\t"<<100.0*double(typesOfMismatches[j][i])/double(matches[i]+mismatches[i]); else cout<<"\tNA"; } cout<<endl; } return 0; }
// bamrealignment: re-align each mapped read in a BAM file against a reference
// reconstructed from the read's own bases + MD tag, then rewrite CIGAR/MD (and
// possibly the start position).  Unmapped reads and all failure cases pass
// through unchanged.  Optionally logs a one-line status per read.
int main (int argc, const char *argv[])
{
  printf ("------------- bamrealignment --------------\n");

  // --- command line options -------------------------------------------------
  OptArgs opts;
  opts.ParseCmdLine(argc, argv);
  vector<int> score_vals(4);                // match, mismatch, gap open, gap extend

  string input_bam   = opts.GetFirstString ('i', "input", "");
  string output_bam  = opts.GetFirstString ('o', "output", "");
  opts.GetOption(score_vals, "4,-6,-5,-2", 's', "scores");
  int clipping       = opts.GetFirstInt     ('c', "clipping", 2);
  bool anchors       = opts.GetFirstBoolean ('a', "anchors", true);
  int bandwidth      = opts.GetFirstInt     ('b', "bandwidth", 10);
  bool verbose       = opts.GetFirstBoolean ('v', "verbose", false);
  bool debug         = opts.GetFirstBoolean ('d', "debug", false);
  int format         = opts.GetFirstInt     ('f', "format", 1);   // 1 = uncompressed BAM
  int num_threads    = opts.GetFirstInt     ('t', "threads", 8);
  string log_fname   = opts.GetFirstString  ('l', "log", "");

  if (input_bam.empty() or output_bam.empty())
    return PrintHelp();
  opts.CheckNoLeftovers();

  // optional per-read log file; fatal if requested but unopenable
  std::ofstream logf;
  if (log_fname.size ()) {
    logf.open (log_fname.c_str ());
    if (!logf.is_open ()) {
      fprintf (stderr, "bamrealignment: Failed to open log file %s\n", log_fname.c_str());
      return 1;
    }
  }

  BamReader reader;
  if (!reader.Open(input_bam)) {
    fprintf(stderr, "bamrealignment: Failed to open input file %s\n", input_bam.c_str());
    return 1;
  }

  SamHeader header = reader.GetHeader();
  RefVector refs   = reader.GetReferenceData();

  // writer inherits the input header/references unchanged
  BamWriter writer;
  writer.SetNumThreads(num_threads);
  if (format == 1)
    writer.SetCompressionMode(BamWriter::Uncompressed);
  else
    writer.SetCompressionMode(BamWriter::Compressed);
  if (!writer.Open(output_bam, header, refs)) {
    fprintf(stderr, "bamrealignment: Failed to open output file %s\n", output_bam.c_str());
    return 1;
  }

  // The meat starts here ------------------------------------

  if (verbose)
    cout << "Verbose option is activated, each alignment will print to screen."
         << endl << " After a read hit RETURN to continue to the next one,"
         << endl << " or press q RETURN to quit the program,"
         << endl << " or press s Return to silence verbose,"
         << endl << " or press c RETURN to continue printing without further prompt."
         << endl << endl;

  // --- per-run counters for the summary report ------------------------------
  unsigned int readcounter = 0;
  unsigned int mapped_readcounter = 0;
  unsigned int realigned_readcounter = 0;
  unsigned int modified_alignment_readcounter = 0;
  unsigned int pos_update_readcounter = 0;
  unsigned int failed_clip_realigned_readcount = 0;
  unsigned int already_perfect_readcount = 0;
  unsigned int bad_md_tag_readcount = 0;
  unsigned int error_recreate_ref_readcount = 0;
  unsigned int error_clip_anchor_readcount = 0;
  unsigned int error_sw_readcount = 0;
  unsigned int error_unclip_readcount = 0;

  unsigned int start_position_shift;        // filled by computeSWalignment
  int orig_position;                        // position before realignment (for log)
  int new_position;

  // 'input' doubles as interactive-prompt state: 'c' = keep printing, 'q' = quit
  string md_tag, new_md_tag, input = "x";
  vector<CigarOp>   new_cigar_data;
  vector<MDelement> new_md_data;
  bool position_shift = false;
  time_t start_time = time(NULL);

  Realigner aligner;
  aligner.verbose_ = verbose;
  aligner.debug_   = debug;
  if (!aligner.SetScores(score_vals))
    cout << "bamrealignment: Four scores need to be provided: match, mismatch, gap open, gap extend score!" << endl;
  aligner.SetAlignmentBandwidth(bandwidth);

  BamAlignment alignment;
  while(reader.GetNextAlignment(alignment)){
    readcounter ++;
    position_shift = false;
    if ( (readcounter % 100000) == 0 )
      cout << "Processed " << readcounter << " reads. Elapsed time: " << (time(NULL) - start_time) << endl;

    if (alignment.IsMapped()) {
      orig_position = alignment.Position;
      mapped_readcounter++;
      // clipping behavior depends on read orientation
      aligner.SetClipping(clipping, !alignment.IsReverseStrand());
      if (aligner.verbose_) {
        cout << endl;
        if (alignment.IsReverseStrand())
          cout << "The read is from the reverse strand." << endl;
        else
          cout << "The read is from the forward strand." << endl;
      }

      if (!alignment.GetTag("MD", md_tag)) {
        // Without an MD tag the reference cannot be reconstructed; skip realignment.
        if (aligner.verbose_)
          cout << "Warning: Skipping read " << alignment.Name << ". It is mapped but missing MD tag." << endl;
        if (logf.is_open ())
          logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "MISSMD" << '\n';
        bad_md_tag_readcount++;
      } else if (aligner.CreateRefFromQueryBases(alignment.QueryBases, alignment.CigarData, md_tag, anchors)) {
        // Reference reconstruction succeeded (possibly with a failed anchor clip).
        bool clipfail = false;
        if (Realigner::CR_ERR_CLIP_ANCHOR == aligner.GetCreateRefError ()) {
          clipfail = true;
          failed_clip_realigned_readcount ++;
        }

        if (!aligner.computeSWalignment(new_cigar_data, new_md_data, start_position_shift)) {
          // Smith-Waterman failed: pass the read through unchanged.
          if (aligner.verbose_)
            cout << "Error in the alignment! Not updating read information." << endl;
          if (logf.is_open ())
            logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "SWERR" << '\n';
          error_sw_readcount++;
          writer.SaveAlignment(alignment); // Write alignment unchanged
          continue;
        }

        if (!aligner.addClippedBasesToTags(new_cigar_data, new_md_data, alignment.QueryBases.size())) {
          // Restoring the clipped anchors failed: pass the read through unchanged.
          if (aligner.verbose_)
            cout << "Error when adding clipped anchors back to tags! Not updating read information." << endl;
          if (logf.is_open ())
            logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "UNCLIPERR" << '\n';
          writer.SaveAlignment(alignment); // Write alignment unchanged
          error_unclip_readcount ++;
          continue;
        }

        new_md_tag = aligner.GetMDstring(new_md_data);
        realigned_readcounter++;

        // adjust start position of read
        if (!aligner.LeftAnchorClipped() and start_position_shift != 0) {
          new_position = aligner.updateReadPosition(alignment.CigarData, (int)start_position_shift, alignment.Position);
          if (new_position != alignment.Position) {
            pos_update_readcounter++;
            position_shift = true;
            alignment.Position = new_position;
          }
        }

        // log whether realignment actually changed anything observable
        if (position_shift || alignment.CigarData.size () != new_cigar_data.size () || md_tag != new_md_tag) {
          if (logf.is_open ()) {
            logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "MOD";
            if (position_shift)
              logf << "-SHIFT";
            if (clipfail)
              logf << " NOCLIP";
            logf << '\n';
          }
          modified_alignment_readcounter++;
        } else {
          if (logf.is_open ()) {
            logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "UNMOD";
            if (clipfail)
              logf << " NOCLIP";
            logf << '\n';
          }
        }

        if (aligner.verbose_){
          cout << alignment.Name << endl;
          cout << "------------------------------------------" << endl;
          // Wait for input to continue or quit program
          if (input.size() == 0)
            input = 'x';
          else if (input[0] != 'c' and input[0] != 'C')
            getline(cin, input);
          if (input.size()>0){
            if (input[0] == 'q' or input[0] == 'Q')
              return 1;
            else if (input[0] == 's' or input[0] == 'S')
              aligner.verbose_ = false;
          }
        }

        // Finally update alignment information
        alignment.CigarData = new_cigar_data;
        alignment.EditTag("MD", "Z" , new_md_tag);

      } // end of CreateRef else if
      else {
        // Reference reconstruction did not run realignment; classify why.
        switch (aligner.GetCreateRefError ()) {
          case Realigner::CR_ERR_RECREATE_REF:
            if (logf.is_open ())
              logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "RECRERR" << '\n';
            error_recreate_ref_readcount++;
            break;
          case Realigner::CR_ERR_CLIP_ANCHOR:
            if (logf.is_open ())
              logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "CLIPERR" << '\n';
            error_clip_anchor_readcount++;
            break;
          default:
            // On a good run this writes way too many reads to the log file - don't want to create a too large txt file
            // if (logf.is_open ())
            //logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "PERFECT" << '\n';
            already_perfect_readcount++;
            break;
        }

        if (aligner.verbose_) {
          cout << alignment.Name << endl;
          cout << "------------------------------------------" << endl;
          // Wait for input to continue or quit program
          if (input.size() == 0)
            input = 'x';
          else if (input[0] != 'c' and input[0] != 'C')
            getline(cin, input);
          if (input.size()>0){
            if (input[0] == 'q' or input[0] == 'Q')
              return 1;
            else if (input[0] == 's' or input[0] == 'S')
              aligner.verbose_ = false;
          }
        }
      }

      // --- Debug output for Rajesh ---
      if (debug && aligner.invalid_cigar_in_input) {
        aligner.verbose_ = true;
        cout << "Invalid cigar string / md tag pair in read " << alignment.Name << endl;
        // Rerun reference generation to display error
        aligner.CreateRefFromQueryBases(alignment.QueryBases, alignment.CigarData, md_tag, anchors);
        aligner.verbose_ = verbose;
        aligner.invalid_cigar_in_input = false;
      }
      // --- --- ---

    } // end of if isMapped
    // every read (mapped or not, realigned or not) is written exactly once here,
    // except reads already written + 'continue'd in the error paths above
    writer.SaveAlignment(alignment);
  } // end while loop over reads

  if (aligner.invalid_cigar_in_input)
    cerr << "WARNING bamrealignment: There were invalid cigar string / md tag pairs in the input bam file." << endl;

  // ----------------------------------------------------------------
  // program end -- output summary information
  cout << " File: " << input_bam << endl
       << " Total reads: " << readcounter << endl
       << " Mapped reads: " << mapped_readcounter << endl;
  if (bad_md_tag_readcount)
    cout << " Skipped: bad MD tags: " << bad_md_tag_readcount << endl;
  if (error_recreate_ref_readcount)
    cout << " Skipped: unable to recreate ref: " << error_recreate_ref_readcount << endl;
  if (error_clip_anchor_readcount)
    cout << " Skipped: error clipping anchor: " << error_clip_anchor_readcount << endl;
  cout << " Skipped: already perfect: " << already_perfect_readcount << endl
       << " Total reads realigned: " << mapped_readcounter - already_perfect_readcount - bad_md_tag_readcount - error_recreate_ref_readcount - error_clip_anchor_readcount << endl;
  if (failed_clip_realigned_readcount)
    cout << " (including " << failed_clip_realigned_readcount << " that failed to clip)" << endl;
  if (error_sw_readcount)
    cout << " Failed to complete SW alignment: " << error_sw_readcount << endl;
  if (error_unclip_readcount)
    cout << " Failed to unclip anchor: " << error_unclip_readcount << endl;
  cout << " Succesfully realigned: " << realigned_readcounter << endl
       << " Modified alignments: " << modified_alignment_readcounter << endl
       << " Shifted position: " << pos_update_readcounter << endl;
  cout << "Processing time: " << (time(NULL)-start_time) << " seconds." << endl;
  cout << "INFO: The output BAM file may be unsorted." << endl;
  cout << "------------------------------------------" << endl;
  return 0;
}
int main(int argc, char* argv[]) { // validate argument count if( argc != 2 ) { cerr << "USAGE: " << argv[0] << " <input BAM file> " << endl; return EXIT_FAILURE; } string filename = argv[1]; //cerr << "Printing alignments from file: " << filename << endl; BamReader reader; if (!reader.Open(filename)) { cerr << "could not open filename " << filename << endl; return EXIT_FAILURE; } cerr << filename << ": Done opening" << endl; // Header can't be used to accurately determine sort order because samtools never // changes it; instead, check after loading each read as is done with "samtools index" // We don't need to load an index (right?) // if (!reader.LocateIndex()) { // const string index_filename = filename + ".bai"; // if (!reader.OpenIndex(index_filename)) { // cerr << "could not open index" << endl; // } // } const SamHeader header = reader.GetHeader(); cerr << filename << ": Done getting header" << endl; const RefVector refs = reader.GetReferenceData(); cerr << filename << ": Done getting reference data" << endl; BamWriter writer; if (! output_bam_filename.empty()) { if (! 
writer.Open(output_bam_filename, header, refs)) { cerr << "Could not open BAM output file " << output_bam_filename << endl; return EXIT_FAILURE; } cerr << filename << ": Done opening BAM output file " << output_bam_filename << endl; } alignmentMap read1Map; // a single map, for all reads awaiting their mate typedef map<string,int32_t> stringMap; typedef stringMap::iterator stringMapI; stringMap ref_mates; // alignmentMap read1Map, read2Map; BamAlignment full_al; int32_t count = 0; uint32_t max_reads_in_map = 0; int32_t n_reads_skipped_unmapped = 0; int32_t n_reads_skipped_mate_unmapped = 0; int32_t n_reads_skipped_wont_see_mate = 0; int32_t n_reads_skipped_mate_tail_est = 0; int32_t n_reads_skipped_ref_mate = 0; int32_t n_reads = 0; int32_t n_singleton_reads = 0; int32_t last_RefID = -1; int32_t last_Position = -1; cerr << filename << ": Looking for up to " << pairs_to_process << " link pairs," << " total tail = " << link_pair_total_tail << " critical tail = " << link_pair_crit_tail << ", must be on diff chromosome = " << link_pair_diff_chrom << endl; while (reader.GetNextAlignment(full_al) && (! pairs_to_process || count < pairs_to_process)) { BamAlignment al = full_al; //printAlignmentInfo(al, refs); //++count; ++n_reads; if (last_RefID < 0) last_RefID = al.RefID; if (last_Position < 0) last_Position = al.Position; if (al.RefID > last_RefID) { // We've moved to the next reference sequence // Clean up reads with mates expected here that haven't been seen if (debug_ref_mate) { cerr << "MISSED " << ref_mates.size() << " ref_mates on this reference " << last_RefID << " " << refs[last_RefID].RefName << endl; } for (stringMapI rmI = ref_mates.begin(); rmI != ref_mates.end(); ++rmI) { ++n_reads_skipped_ref_mate; read1Map.erase(read1Map.find(rmI->first)); ref_mates.erase(ref_mates.find(rmI->first)); } last_RefID = al.RefID; last_Position = al.Position; } else if (! 
isCoordinateSorted(al.RefID, al.Position, last_RefID, last_Position)) { cerr << filename << " is not sorted, " << al.Name << " out of position" << endl; return EXIT_FAILURE; } if (! al.IsMapped()) { ++n_reads_skipped_unmapped; continue; } if (! al.IsMateMapped()) { ++n_reads_skipped_mate_unmapped; continue; } alignmentMapI mI = read1Map.find(al.Name); if (mI == read1Map.end()) { // the read name has not been seen before if (al.MateRefID < al.RefID || (al.MateRefID == al.RefID && al.MatePosition < al.Position)) { // we should have seen its mate earlier, so skip it ++n_reads_skipped_wont_see_mate; continue; } // If the mate likely to also be a link pair candidate, add the read int32_t mate_tail_est = readTailS(al.IsMateMapped(), al.IsMateReverseStrand(), al.MatePosition, refs[al.MateRefID].RefLength, max_read_length); if (mate_tail_est <= mate_tail_est_crit) { // the mate tail estimate suggests it might be a link pair candidate read1Map[al.Name] = al; // add the read to the map } else { // the mate tail estimate appears too long for the mate to be a candidate ++n_reads_skipped_mate_tail_est; continue; } if (read1Map.size() > max_reads_in_map) max_reads_in_map = read1Map.size(); if (al.MateRefID == al.RefID && al.MatePosition >= al.Position) { // the mate is expected later on this contig ref_mates[al.Name] = al.MateRefID; } } else { // get the mate's alignment, and process the pair const BamAlignment& al_mate = mI->second; if (processReadPair(al, al_mate, refs, link_pair_total_tail, link_pair_crit_tail, link_pair_diff_chrom)) { ++count; // write to the new BAM file, if the string is not empty if (! 
output_bam_filename.empty()) { writer.SaveAlignment(al_mate); // the first one seen writer.SaveAlignment(al); // the second one seen } } read1Map.erase(mI); if (al.MateRefID == al.RefID) { stringMapI rmI = ref_mates.find(al.Name); if (rmI == ref_mates.end()) { cerr << "expected a ref_mate, couldn't find its name: " << al.Name << endl; return EXIT_FAILURE; } ref_mates.erase(rmI); } } } cerr << "===============================" << endl; cerr << read1Map.size() << " alignments left in read1Map" << endl; cerr << max_reads_in_map << " maximum number of reads in read1Map" << endl; cerr << count << " pairs processed" << endl; cerr << "===============================" << endl; cerr << n_reads << " total reads" << endl; cerr << n_singleton_reads << " singleton reads" << endl; cerr << n_reads_skipped_unmapped << " reads skipped because unmapped" << endl; cerr << n_reads_skipped_mate_unmapped << " reads skipped because mate unmapped" << endl; cerr << n_reads_skipped_wont_see_mate << " reads skipped because mate won't be seen" << endl; cerr << n_reads_skipped_mate_tail_est << " reads skipped because mate tail appears too long" << endl; cerr << n_reads_skipped_ref_mate << " reads skipped because mate not on reference" << endl; reader.Close(); if (! output_bam_filename.empty()) { writer.Close(); } return EXIT_SUCCESS; }
void BedIntersectPE::ProcessBamBlock (const BamAlignment &bam1, const BamAlignment &bam2, const RefVector &refs, BamWriter &writer) { vector<BED> hits, hits1, hits2; // vector of potential hits hits.reserve(1000); // reserve some space hits1.reserve(1000); hits2.reserve(1000); bool overlapsFound; // flag to indicate if overlaps were found if ( (_searchType == "either") || (_searchType == "xor") || (_searchType == "both") || (_searchType == "notboth") || (_searchType == "neither") ) { // create a new BEDPE feature from the BAM alignments. BEDPE a; ConvertBamToBedPE(bam1, bam2, refs, a); if (_bamOutput == true) { // BAM output // write to BAM if correct hits found overlapsFound = FindOneOrMoreOverlaps(a, _searchType); if (overlapsFound == true) { writer.SaveAlignment(bam1); writer.SaveAlignment(bam2); } } else { // BEDPE output FindOverlaps(a, hits1, hits2, _searchType); hits1.clear(); hits2.clear(); } } else if ( (_searchType == "ispan") || (_searchType == "ospan") ) { // only look for ispan and ospan when both ends are mapped. if (bam1.IsMapped() && bam2.IsMapped()) { // only do an inspan or outspan check if the alignment is intrachromosomal if (bam1.RefID == bam2.RefID) { // create a new BEDPE feature from the BAM alignments. BEDPE a; ConvertBamToBedPE(bam1, bam2, refs, a); if (_bamOutput == true) { // BAM output // look for overlaps, and write to BAM if >=1 were found overlapsFound = FindOneOrMoreSpanningOverlaps(a, _searchType); if (overlapsFound == true) { writer.SaveAlignment(bam1); writer.SaveAlignment(bam2); } } else { // BEDPE output FindSpanningOverlaps(a, hits, _searchType); hits.clear(); } } } } else if ( (_searchType == "notispan") || (_searchType == "notospan") ) { // only look for notispan and notospan when both ends are mapped. if (bam1.IsMapped() && bam2.IsMapped()) { // only do an inspan or outspan check if the alignment is intrachromosomal if (bam1.RefID == bam2.RefID) { // create a new BEDPE feature from the BAM alignments. 
BEDPE a; ConvertBamToBedPE(bam1, bam2, refs, a); if (_bamOutput == true) { // BAM output // write to BAM if there were no overlaps overlapsFound = FindOneOrMoreSpanningOverlaps(a, _searchType); if (overlapsFound == false) { writer.SaveAlignment(bam1); writer.SaveAlignment(bam2); } } else { // BEDPE output FindSpanningOverlaps(a, hits, _searchType); hits.clear(); } } // if inter-chromosomal or orphaned, we know it's not ispan and not ospan else if (_bamOutput == true) { writer.SaveAlignment(bam1); writer.SaveAlignment(bam2); } } // if both ends aren't mapped, we know that it's notispan and not ospan else if (_bamOutput == true) { writer.SaveAlignment(bam1); writer.SaveAlignment(bam2); } } }
int main (int argc, char *argv[]) { bool produceUnCompressedBAM=false; bool verbose=false; bool ancientDNA=false; bool keepOrig=false; string adapter_F=options_adapter_F_BAM; string adapter_S=options_adapter_S_BAM; string adapter_chimera=options_adapter_chimera_BAM; string key=""; bool allowMissing=false; int trimCutoff=1; bool allowAligned=false; bool printLog=false; string logFileName; BamReader reader; BamWriter writer; string bamFile; string bamFileOUT=""; string key1; string key2; bool useDist=false; double location=-1.0; double scale =-1.0; bool fastqFormat=false; string fastqfile1 = ""; string fastqfile2 = ""; string fastqoutfile = ""; bool singleEndModeFQ=true; const string usage=string(string(argv[0])+ " [options] BAMfile"+"\n"+ "\nThis program takes an unaligned BAM where mates are consecutive\nor fastq files and trims and merges reads\n"+ "\n\tYou can specify a unaligned bam file or one or two fastq :\n"+ "\t\t"+"-fq1" +"\t\t"+"First fastq"+"\n"+ "\t\t"+"-fq2" +"\t\t"+"Second fastq file (for paired-end)"+"\n"+ "\t\t"+"-fqo" +"\t\t"+"Output fastq prefix"+"\n\n"+ //"\t"+"-p , --PIPE"+"\n\t\t"+"Read BAM from and write it to PIPE"+"\n"+ "\t"+"-o , --outfile" +"\t\t"+"Output (BAM format)."+"\n"+ "\t"+"-u " +"\t\t"+"Produce uncompressed bam (good for pipe)"+"\n"+ // "\t"+" , --outprefix" +"\n\t\t"+"Prefix for output files (default '"+outprefix+"')."+"\n"+ //"\t"+" , --SAM" +"\n\t\t"+"Output SAM not BAM."+"\n"+ "\t"+"--aligned" +"\t\t"+"Allow reads to be aligned (default "+boolStringify(allowAligned)+")"+"\n"+ "\t"+"-v , --verbose" +"\t\t"+"Turn all messages on (default "+boolStringify(verbose)+")"+"\n"+ "\t"+"--log [log file]" +"\t"+"Print a tally of merged reads to this log file (default only to stderr)"+"\n"+ "\n\t"+"Paired End merging/Single Read trimming options"+"\n"+ "\t\t"+"You can specify either:"+"\n"+ "\t\t\t"+"--ancientdna"+"\t\t\t"+"ancient DNA (default "+boolStringify(ancientDNA)+")"+"\n"+ "\t\t"+" "+"\t\t\t\t"+"this allows for partial 
overlap"+"\n"+ "\n\t\t"+"or if you know your size length distribution:"+"\n"+ "\t\t\t"+"--loc"+"\t\t\t\t"+"Location for lognormal dist. (default none)"+"\n"+ "\t\t\t"+"--scale"+"\t\t\t\t"+"Scale for lognormal dist. (default none)"+"\n"+ // "\t\t\t\t\t\t\tGood for merging ancient DNA reads into a single sequence\n\n" "\n\t\t"+"--keepOrig"+"\t\t\t\t"+"Write original reads if they are trimmed or merged (default "+boolStringify(keepOrig)+")"+"\n"+ "\t\t\t\t\t\t\tSuch reads will be marked as PCR duplicates\n\n" "\t\t"+"-f , --adapterFirstRead" +"\t\t\t"+"Adapter that is observed after the forward read (def. Multiplex: "+options_adapter_F_BAM.substr(0,30)+")"+"\n"+ "\t\t"+"-s , --adapterSecondRead" +"\t\t"+"Adapter that is observed after the reverse read (def. Multiplex: "+options_adapter_S_BAM.substr(0,30)+")"+"\n"+ "\t\t"+"-c , --FirstReadChimeraFilter" +"\t\t"+"If the forward read looks like this sequence, the cluster is filtered out.\n\t\t\t\t\t\t\tProvide several sequences separated by comma (def. Multiplex: "+options_adapter_chimera_BAM.substr(0,30)+")"+"\n"+ "\t\t"+"-k , --key"+"\t\t\t\t"+"Key sequence with which each sequence starts. Comma separate for forward and reverse reads. (default '"+key+"')"+"\n"+ "\t\t"+"-i , --allowMissing"+"\t\t\t"+"Allow one base in one key to be missing or wrong. 
(default "+boolStringify(allowMissing)+")"+"\n"+ "\t\t"+"-t , --trimCutoff"+"\t\t\t"+"Lowest number of adapter bases to be observed for single Read trimming (default "+stringify(trimCutoff)+")"); if( (argc== 1) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cout<<"Usage:"<<endl; cout<<""<<endl; cout<<usage<<endl; return 1; } for(int i=1;i<(argc-1);i++){ //all but the last arg if(strcmp(argv[i],"-fq1") == 0 ){ fastqfile1=string(argv[i+1]); fastqFormat=true; i++; continue; } if(strcmp(argv[i],"-fq2") == 0 ){ fastqfile2=string(argv[i+1]); fastqFormat=true; singleEndModeFQ=false; i++; continue; } if(strcmp(argv[i],"-fqo") == 0 ){ fastqoutfile=string(argv[i+1]); fastqFormat=true; i++; continue; } if(strcmp(argv[i],"--log") == 0 ){ logFileName =string(argv[i+1]); printLog=true; i++; continue; } if(strcmp(argv[i],"-p") == 0 || strcmp(argv[i],"--PIPE") == 0 ){ cerr<<"This version no longer works with pipe, exiting"<<endl; return 1; } if(strcmp(argv[i],"-u") == 0 ){ produceUnCompressedBAM=true; continue; } if(strcmp(argv[i],"--aligned") == 0 ){ allowAligned=true; continue; } if(strcmp(argv[i],"-o") == 0 || strcmp(argv[i],"--outfile") == 0 ){ bamFileOUT =string(argv[i+1]); i++; continue; } if(strcmp(argv[i],"-v") == 0 || strcmp(argv[i],"--verbose") == 0 ){ verbose=true; continue; } if(strcmp(argv[i],"--ancientdna") == 0 ){ ancientDNA=true; continue; } if(strcmp(argv[i],"--keepOrig") == 0 ){ keepOrig=true; continue; } if(strcmp(argv[i],"--loc") == 0 ){ location =destringify<double>(argv[i+1]); i++; continue; } if(strcmp(argv[i],"--scale") == 0 ){ scale =destringify<double>(argv[i+1]); i++; continue; } if(strcmp(argv[i],"-f") == 0 || strcmp(argv[i],"--adapterFirstRead") == 0 ){ adapter_F =string(argv[i+1]); i++; continue; } if(strcmp(argv[i],"-s") == 0 || strcmp(argv[i],"--adapterSecondRead") == 0 ){ adapter_S =string(argv[i+1]); i++; continue; } if(strcmp(argv[i],"-c") == 0 || 
strcmp(argv[i],"--FirstReadChimeraFilter") == 0 ){ adapter_chimera =string(argv[i+1]); i++; continue; } if(strcmp(argv[i],"-k") == 0 || strcmp(argv[i],"--keys") == 0 ){ key =string(argv[i+1]); i++; continue; } if(strcmp(argv[i],"-i") == 0 || strcmp(argv[i],"--allowMissing") == 0 ){ allowMissing=true; continue; } if(strcmp(argv[i],"-t") == 0 || strcmp(argv[i],"--trimCutoff") == 0 ){ trimCutoff=atoi(argv[i+1]); i++; continue; } cerr<<"Unknown option "<<argv[i] <<" exiting"<<endl; return 1; } bamFile=argv[argc-1]; if( (location != -1.0 && scale == -1.0) || (location == -1.0 && scale != -1.0) ){ cerr<<"Cannot specify --location without specifying --scale"<<endl; return 1; } if( (location != -1.0 && scale != -1.0) ){ useDist=true; if(ancientDNA){ cerr<<"Cannot specify --location/--scale and --ancientDNA"<<endl; return 1; } } MergeTrimReads mtr (adapter_F,adapter_S,adapter_chimera, key1,key2, trimCutoff,allowMissing,ancientDNA,location,scale,useDist); fqwriters onereadgroup; if(fastqFormat){ if( bamFileOUT != "" || produceUnCompressedBAM || allowAligned){ cerr<<"ERROR : Cannot specify options like -o, -u or --allowAligned for fastq"<<endl; return 1; } if(fastqfile1 == ""){ cerr<<"ERROR : Must specify as least the first file for fastq"<<endl; return 1; } FastQParser * fqp1; FastQParser * fqp2; if(singleEndModeFQ){ fqp1 = new FastQParser (fastqfile1); string outdirs = fastqoutfile+".fq.gz"; string outdirsf = fastqoutfile+".fail.fq.gz"; onereadgroup.single.open(outdirs.c_str(), ios::out); onereadgroup.singlef.open(outdirsf.c_str(), ios::out); if(!onereadgroup.single.good()){ cerr<<"Cannot write to file "<<outdirs<<endl; return 1; } if(!onereadgroup.singlef.good()){ cerr<<"Cannot write to file "<<outdirsf<<endl; return 1; } }else{ fqp1 = new FastQParser (fastqfile1); fqp2 = new FastQParser (fastqfile2); string outdirs = fastqoutfile+".fq.gz"; string outdir1 = fastqoutfile+"_r1.fq.gz"; string outdir2 = fastqoutfile+"_r2.fq.gz"; string outdirsf = fastqoutfile+".fail.fq.gz"; 
string outdir1f = fastqoutfile+"_r1.fail.fq.gz"; string outdir2f = fastqoutfile+"_r2.fail.fq.gz"; onereadgroup.single.open(outdirs.c_str(), ios::out); onereadgroup.pairr1.open(outdir1.c_str(), ios::out); onereadgroup.pairr2.open(outdir2.c_str(), ios::out); onereadgroup.singlef.open(outdirsf.c_str(), ios::out); onereadgroup.pairr1f.open(outdir1f.c_str(), ios::out); onereadgroup.pairr2f.open(outdir2f.c_str(), ios::out); if(!onereadgroup.single.good()){ cerr<<"Cannot write to file "<<outdirs<<endl; return 1; } if(!onereadgroup.pairr1.good()){ cerr<<"Cannot write to file "<<outdir1<<endl; return 1; } if(!onereadgroup.pairr2.good()){ cerr<<"Cannot write to file "<<outdir2<<endl; return 1; } if(!onereadgroup.singlef.good()){ cerr<<"Cannot write to file "<<outdirsf<<endl; return 1; } if(!onereadgroup.pairr1f.good()){ cerr<<"Cannot write to file "<<outdir1f<<endl; return 1; } if(!onereadgroup.pairr2f.good()){ cerr<<"Cannot write to file "<<outdir2f<<endl; return 1; } } unsigned int totalSeqs=0; while(fqp1->hasData()){ FastQObj * fo1=fqp1->getData(); vector<string> def1=allTokens( *(fo1->getID()), ' ' ); string def1s=def1[0]; FastQObj * fo2; string def2s; string ext2s; if(!singleEndModeFQ){ if(!fqp2->hasData()){ cerr << "ERROR: Discrepency between fastq files at record " << *(fo1->getID()) <<endl; return 1; } fo2=fqp2->getData(); vector<string> def2=allTokens( *(fo2->getID()), ' ' ); def2s=def2[0]; if(strEndsWith(def1s,"/1")){ def1s=def1s.substr(0,def1s.size()-2); } if(strEndsWith(def2s,"/2")){ def2s=def2s.substr(0,def2s.size()-2); } if(strBeginsWith(def1s,"@")){ def1s=def1s.substr(1,def1s.size()-1); } if(strBeginsWith(def2s,"@")){ def2s=def2s.substr(1,def2s.size()-1); } if(def1s != def2s){ cerr << "ERROR: Discrepency between fastq files, different names " << *(fo1->getID()) <<" and "<< *(fo2->getID()) <<endl; return 1; } merged result= mtr.process_PE(*(fo1->getSeq()),*(fo1->getQual()), *(fo2->getSeq()),*(fo2->getQual())); mtr.incrementCountall(); if(result.code != ' '){ 
//keys or chimeras if(result.code == 'K'){ mtr.incrementCountfkey(); }else{ if(result.code == 'D'){ mtr.incrementCountchimera(); }else{ cerr << "leehom: Wrong return code =\""<<result.code<<"\""<<endl; exit(1); } } onereadgroup.pairr2f<<"@"<<def2s<<"/2" <<endl <<*(fo2->getSeq())<<endl<<"+"<<endl <<*(fo2->getQual())<<endl; onereadgroup.pairr1f<<"@"<<def1s<<"/1" <<endl <<*(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl; continue; }else{ if(result.sequence != ""){ //new sequence onereadgroup.single<<"@"<<def1s<<"" <<endl << result.sequence<<endl<<"+"<<endl <<result.quality<<endl; if( result.sequence.length() > max(fo1->getSeq()->length(),fo2->getSeq()->length()) ){ mtr.incrementCountmergedoverlap(); }else{ mtr.incrementCountmerged(); } }else{ //keep as is mtr.incrementCountnothing(); onereadgroup.pairr2<<"@"<<def2s<<"/2" <<endl <<*(fo2->getSeq())<<endl<<"+"<<endl <<*(fo2->getQual())<<endl; onereadgroup.pairr1<<"@"<<def1s<<"/1" <<endl <<*(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl; } } }else{ merged result=mtr.process_SR(*(fo1->getSeq()),*(fo1->getQual())); mtr.incrementCountall(); if(result.code != ' '){ //either chimera or missing key if(result.code == 'K'){ mtr.incrementCountfkey(); }else{ if(result.code == 'D'){ mtr.incrementCountchimera(); }else{ cerr << "leehom: Wrong return code =\""<<result.code<<"\""<<endl; exit(1); } } onereadgroup.singlef<<""<<*(fo1->getID())<<"" <<endl << *(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl; continue; } if(result.sequence != ""){ //new sequence mtr.incrementCounttrimmed(); onereadgroup.single<<""<<*(fo1->getID())<<"" <<endl << result.sequence<<endl<<"+"<<endl <<result.quality<<endl; }else{ mtr.incrementCountnothing(); onereadgroup.single<<""<<*(fo1->getID())<<"" <<endl << *(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl; } } totalSeqs++; } delete fqp1; if(!singleEndModeFQ){ delete fqp2; } if(singleEndModeFQ){ onereadgroup.single.close(); onereadgroup.singlef.close(); }else{ 
onereadgroup.single.close(); onereadgroup.pairr1.close(); onereadgroup.pairr2.close(); onereadgroup.singlef.close(); onereadgroup.pairr1f.close(); onereadgroup.pairr2f.close(); } //fastq }else{ //else BAM // initMerge(); // set_adapter_sequences(adapter_F, // adapter_S, // adapter_chimera); // set_options(trimCutoff,allowMissing,mergeoverlap); if(key != ""){ size_t found=key.find(","); if (found == string::npos){ //single end reads key1=key; key2=""; } else{ //paired-end key1=key.substr(0,found); key2=key.substr(found+1,key.length()-found+1); } } if( bamFileOUT == "" ){ cerr<<"The output must be a be specified, exiting"<<endl; return 1; } if ( !reader.Open(bamFile) ) { cerr << "Could not open input BAM file "<<bamFile << endl; return 1; } SamHeader header = reader.GetHeader(); string pID = "mergeTrimReadsBAM"; string pName = "mergeTrimReadsBAM"; string pCommandLine = ""; for(int i=0;i<(argc);i++){ pCommandLine += (string(argv[i])+" "); } putProgramInHeader(&header,pID,pName,pCommandLine,returnGitHubVersion(string(argv[0]),"..")); const RefVector references = reader.GetReferenceData(); //we will not call bgzip with full compression, good for piping into another program to //lessen the load on the CPU if(produceUnCompressedBAM) writer.SetCompressionMode(BamWriter::Uncompressed); if ( !writer.Open(bamFileOUT,header,references) ) { cerr << "Could not open output BAM file "<<bamFileOUT << endl; return 1; } SamHeader sh=reader.GetHeader(); //Up to the user to be sure that a sequence is followed by his mate // if(!sh.HasSortOrder() || // sh.SortOrder != "queryname"){ // cerr << "Bamfile must be sorted by queryname" << endl; // return 1; // } BamAlignment al; BamAlignment al2; bool al2Null=true; while ( reader.GetNextAlignment(al) ) { if(al.IsMapped() || al.HasTag("NM") || al.HasTag("MD") ){ if(!allowAligned){ cerr << "Reads should not be aligned" << endl; return 1; }else{ //should we remove tags ? 
} } if(al.IsPaired() && al2Null ){ al2=al; al2Null=false; continue; }else{ if(al.IsPaired() && !al2Null){ bool result = mtr.processPair(al,al2); if( result ){//was merged BamAlignment orig; BamAlignment orig2; if(keepOrig){ orig2 = al2; orig = al; } writer.SaveAlignment(al); if(keepOrig){ orig.SetIsDuplicate(true); orig2.SetIsDuplicate(true); writer.SaveAlignment(orig2); writer.SaveAlignment(orig); } //the second record is empty }else{ //keep the sequences as pairs writer.SaveAlignment(al2); writer.SaveAlignment(al); } // // SINGLE END // }else{ BamAlignment orig; if(keepOrig){ orig =al; } mtr.processSingle(al); if(keepOrig){ //write duplicate if(orig.QueryBases.length() != al.QueryBases.length()){ orig.SetIsDuplicate(true); writer.SaveAlignment(orig); } } writer.SaveAlignment(al); } //end single end al2Null=true; }//second pair } //while al reader.Close(); writer.Close(); } //else BAM cerr <<mtr.reportSingleLine()<<endl; if(printLog){ ofstream fileLog; fileLog.open(logFileName.c_str()); if (fileLog.is_open()){ fileLog <<mtr.reportMultipleLines() <<endl; }else{ cerr << "Unable to print to file "<<logFileName<<endl; } fileLog.close(); } return 0; }
int main (int argc, char *argv[]) { int minBaseQuality = 0; string usage=string(""+string(argv[0])+" [in BAM file] [in VCF file] [chr name] [deam out BAM] [not deam out BAM]"+ "\nThis program divides aligned single end reads into potentially deaminated\n"+ "\nreads and the puts the rest into another bam file if the deaminated positions are not called as the alternative base in the VCF.\n"+ "\nTip: if you do not need one of them, use /dev/null as your output\n"+ "arguments:\n"+ "\t"+"--bq [base qual] : Minimum base quality to flag a deaminated site (Default: "+stringify(minBaseQuality)+")\n"+ "\n"); if(argc == 1 || argc < 4 || (argc == 2 && (string(argv[0]) == "-h" || string(argv[0]) == "--help") ) ){ cerr << "Usage "<<usage<<endl; return 1; } for(int i=1;i<(argc-2);i++){ if(string(argv[i]) == "--bq"){ minBaseQuality=destringify<int>(argv[i+1]); i++; continue; } } string bamfiletopen = string( argv[ argc-5 ] ); string vcffiletopen = string( argv[ argc-4 ] ); string chrname = string( argv[ argc-3 ] ); string deambam = string( argv[ argc-2 ] ); string nondeambam = string( argv[ argc-1 ] ); //dummy reader, will need to reposition anyway VCFreader vcfr (vcffiletopen, vcffiletopen+".tbi", chrname, 1, 1, 0); BamReader reader; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM file"<< bamfiletopen << endl; return 1; } // if ( !reader.LocateIndex() ) { // cerr << "The index for the BAM file cannot be located" << endl; // return 1; // } // if ( !reader.HasIndex() ) { // cerr << "The BAM file has not been indexed." 
<< endl; // return 1; // } //positioning the bam file int refid=reader.GetReferenceID(chrname); if(refid < 0){ cerr << "Cannot retrieve the reference ID for "<< chrname << endl; return 1; } //cout<<"redif "<<refid<<endl; //setting the BAM reader at that position reader.SetRegion(refid, 0, refid, -1); vector<RefData> testRefData=reader.GetReferenceData(); const SamHeader header = reader.GetHeader(); const RefVector references = reader.GetReferenceData(); BamWriter writerDeam; if ( !writerDeam.Open(deambam, header, references) ) { cerr << "Could not open output BAM file" << endl; return 1; } BamWriter writerNoDeam; if ( !writerNoDeam.Open(nondeambam, header, references) ) { cerr << "Could not open output BAM file" << endl; return 1; } unsigned int totalReads =0; unsigned int deaminatedReads =0; unsigned int ndeaminatedReads =0; unsigned int skipped =0; //iterating over the alignments for these regions BamAlignment al; int i; while ( reader.GetNextAlignment(al) ) { // cerr<<al.Name<<endl; //skip unmapped if(!al.IsMapped()){ skipped++; continue; } //skip paired end ! 
if(al.IsPaired() ){ continue; // cerr<<"Paired end not yet coded"<<endl; // return 1; } string reconstructedReference = reconstructRef(&al); char refeBase; char readBase; bool isDeaminated; if(al.Qualities.size() != reconstructedReference.size()){ cerr<<"Quality line is not the same size as the reconstructed reference"<<endl; return 1; } isDeaminated=false; if(al.IsReverseStrand()){ //first base next to 3' i = 0 ; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); if( readBase == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){ //isDeaminated=true; } if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); vcfr.repositionIterator(chrname,al.Position+1,al.Position+1); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); // cout<<*toprint<<endl; //skip deletions in the alt if(toprint->getRef().length() != 1 ) continue; if(toprint->getRef()[0] != refeBase){ cerr<<reconstructedReference<<endl; cerr<<al.Position<<endl; cerr<<numberOfDeletions(&al)<<endl; cerr<<"Problem1 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; exit(1); } //if the VCF has a at least one G but no A if( toprint->hasAtLeastOneG() && !toprint->hasAtLeastOneA() ){ isDeaminated=true; } } } //second base next to 3' i = 1; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'G' && if( readBase == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){ //isDeaminated=true; } if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); vcfr.repositionIterator(chrname,al.Position+2,al.Position+2); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); // cout<<*toprint<<endl; //skip deletions in the alt if(toprint->getRef().length() != 1 ) continue; if(toprint->getRef()[0] != refeBase){ cerr<<reconstructedReference<<endl; cerr<<al.Position<<endl; cerr<<numberOfDeletions(&al)<<endl; 
cerr<<"Problem2 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; exit(1); } //if the VCF has at least one G but no A // if(toprint->hasAtLeastOneG() && // toprint->getAlt().find("A") == string::npos){ if( toprint->hasAtLeastOneG() && !toprint->hasAtLeastOneA() ){ isDeaminated=true; } } } //last base next to 5' i = (al.QueryBases.length()-1) ; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'G' && if( readBase == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){ //isDeaminated=true; } if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); int lengthMatches=countMatchesRecons(reconstructedReference,0); int positionJump = al.Position+lengthMatches+numberOfDeletions(&al); vcfr.repositionIterator(chrname,positionJump,positionJump); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); //skip deletions in the alt if(toprint->getRef().length() != 1 ) continue; if(toprint->getRef()[0] != refeBase){ cerr<<reconstructedReference<<endl; cerr<<al.Position<<endl; cerr<<lengthMatches<<endl; cerr<<numberOfDeletions(&al)<<endl; cerr<<positionJump<<endl; cerr<<"Problem3 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; exit(1); } //if the VCF has at least one G but no A if( toprint->hasAtLeastOneG() && !toprint->hasAtLeastOneA() ){ isDeaminated=true; } } } }else{ //first base next to 5' i = 0; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'C' if( readBase == 'T' && int(al.Qualities[i]-offset) >= minBaseQuality){ if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); vcfr.repositionIterator(chrname,al.Position+1,al.Position+1); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); //cout<<*toprint<<endl; //skip deletions in the alt if(toprint->getRef().length() != 1 ) continue; 
if(toprint->getRef()[0] != refeBase){ cerr<<reconstructedReference<<endl; cerr<<al.Position<<endl; cerr<<numberOfDeletions(&al)<<endl; cerr<<"Problem4 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; exit(1); } //if the VCF has at least one C but no T if( toprint->hasAtLeastOneC() && !toprint->hasAtLeastOneT() ){ isDeaminated=true; } } //cout<<al.Position+ } //second last base next to 3' i = (al.QueryBases.length()-2); refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'C' && if( readBase == 'T' && int(al.Qualities[i]-offset) >= minBaseQuality){ if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); int lengthMatches=countMatchesRecons(reconstructedReference,1); int positionJump = al.Position+lengthMatches+numberOfDeletions(&al); vcfr.repositionIterator(chrname,positionJump,positionJump); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); //skip deletions in the alt if(toprint->getRef().length() != 1 ) continue; if(toprint->getRef()[0] != refeBase){ cerr<<reconstructedReference<<endl; cerr<<al.Position<<endl; cerr<<lengthMatches<<endl; cerr<<numberOfDeletions(&al)<<endl; cerr<<positionJump<<endl; cerr<<"Problem5 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; exit(1); } if( toprint->hasAtLeastOneC() && !toprint->hasAtLeastOneT() ){ isDeaminated=true; } } } //last base next to 3' i = (al.QueryBases.length()-1); refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //&& refeBase == 'C' if( readBase == 'T' && int(al.Qualities[i]-offset) >= minBaseQuality){ if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); int lengthMatches=countMatchesRecons(reconstructedReference,0); int positionJump = al.Position+lengthMatches+numberOfDeletions(&al); 
vcfr.repositionIterator(chrname,positionJump,positionJump); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); //skip deletions in the alt if(toprint->getRef().length() != 1 ) continue; if(toprint->getRef()[0] != refeBase){ cerr<<reconstructedReference<<endl; cerr<<al.Position<<endl; cerr<<lengthMatches<<endl; cerr<<numberOfDeletions(&al)<<endl; cerr<<positionJump<<endl; cerr<<"Problem6 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; exit(1); } if( toprint->hasAtLeastOneC() && !toprint->hasAtLeastOneT() ){ isDeaminated=true; } } } } totalReads++; if(isDeaminated){ deaminatedReads++; writerDeam.SaveAlignment(al); }else{ ndeaminatedReads++; writerNoDeam.SaveAlignment(al); } }//end for each read reader.Close(); writerDeam.Close(); writerNoDeam.Close(); cerr<<"Program finished sucessfully, out of "<<totalReads<<" mapped reads (skipped: "<<skipped<<" reads) we flagged "<<deaminatedReads<<" as deaminated and "<<ndeaminatedReads<<" as not deaminated"<<endl; return 0; }
int IonstatsAlignment(int argc, const char *argv[]) { OptArgs opts; opts.ParseCmdLine(argc, argv); string input_bam_filename = opts.GetFirstString('i', "input", ""); string output_json_filename = opts.GetFirstString('o', "output", "ionstats_alignment.json"); int histogram_length = opts.GetFirstInt ('h', "histogram-length", 400); if(argc < 2 or input_bam_filename.empty()) { IonstatsAlignmentHelp(); return 1; } // // Prepare for metric calculation // BamReader input_bam; if (!input_bam.Open(input_bam_filename)) { fprintf(stderr, "[ionstats] ERROR: cannot open %s\n", input_bam_filename.c_str()); return 1; } ReadLengthHistogram called_histogram; ReadLengthHistogram aligned_histogram; ReadLengthHistogram AQ7_histogram; ReadLengthHistogram AQ10_histogram; ReadLengthHistogram AQ17_histogram; ReadLengthHistogram AQ20_histogram; ReadLengthHistogram AQ47_histogram; SimpleHistogram error_by_position; called_histogram.Initialize(histogram_length); aligned_histogram.Initialize(histogram_length); AQ7_histogram.Initialize(histogram_length); AQ10_histogram.Initialize(histogram_length); AQ17_histogram.Initialize(histogram_length); AQ20_histogram.Initialize(histogram_length); AQ47_histogram.Initialize(histogram_length); error_by_position.Initialize(histogram_length); BamAlignment alignment; vector<char> MD_op; vector<int> MD_len; MD_op.reserve(1024); MD_len.reserve(1024); string MD_tag; // // Main loop over mapped reads in the input BAM // while(input_bam.GetNextAlignment(alignment)) { // Record read length called_histogram.Add(alignment.Length); if (!alignment.IsMapped() or !alignment.GetTag("MD",MD_tag)) continue; // // Step 1. 
Parse MD tag // MD_op.clear(); MD_len.clear(); for (const char *MD_ptr = MD_tag.c_str(); *MD_ptr;) { int item_length = 0; if (*MD_ptr >= '0' and *MD_ptr <= '9') { // Its a match MD_op.push_back('M'); for (; *MD_ptr and *MD_ptr >= '0' and *MD_ptr <= '9'; ++MD_ptr) item_length = 10*item_length + *MD_ptr - '0'; } else { if (*MD_ptr == '^') { // Its a deletion MD_ptr++; MD_op.push_back('D'); } else // Its a substitution MD_op.push_back('X'); for (; *MD_ptr and *MD_ptr >= 'A' and *MD_ptr <= 'Z'; ++MD_ptr) item_length++; } MD_len.push_back(item_length); } // // Step 2. Synchronously scan through Cigar and MD, doing error accounting // int MD_idx = alignment.IsReverseStrand() ? MD_op.size()-1 : 0; int cigar_idx = alignment.IsReverseStrand() ? alignment.CigarData.size()-1 : 0; int increment = alignment.IsReverseStrand() ? -1 : 1; int AQ7_bases = 0; int AQ10_bases = 0; int AQ17_bases = 0; int AQ20_bases = 0; int AQ47_bases = 0; int num_bases = 0; int num_errors = 0; while (cigar_idx < (int)alignment.CigarData.size() and MD_idx < (int) MD_op.size() and cigar_idx >= 0 and MD_idx >= 0) { if (alignment.CigarData[cigar_idx].Length == 0) { // Try advancing cigar cigar_idx += increment; continue; } if (MD_len[MD_idx] == 0) { // Try advancing MD MD_idx += increment; continue; } // Match if (alignment.CigarData[cigar_idx].Type == 'M' and MD_op[MD_idx] == 'M') { int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]); num_bases += advance; alignment.CigarData[cigar_idx].Length -= advance; MD_len[MD_idx] -= advance; // Insertion (read has a base, reference doesn't) } else if (alignment.CigarData[cigar_idx].Type == 'I') { int advance = alignment.CigarData[cigar_idx].Length; for (int cnt = 0; cnt < advance; ++cnt) { error_by_position.Add(num_bases); num_bases++; num_errors++; } alignment.CigarData[cigar_idx].Length -= advance; // Deletion (reference has a base, read doesn't) } else if (alignment.CigarData[cigar_idx].Type == 'D' and MD_op[MD_idx] == 'D') { int 
advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]); for (int cnt = 0; cnt < advance; ++cnt) { error_by_position.Add(num_bases); num_errors++; } alignment.CigarData[cigar_idx].Length -= advance; MD_len[MD_idx] -= advance; // Substitution } else if (MD_op[MD_idx] == 'X') { int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]); for (int cnt = 0; cnt < advance; ++cnt) { error_by_position.Add(num_bases); num_bases++; num_errors++; } alignment.CigarData[cigar_idx].Length -= advance; MD_len[MD_idx] -= advance; } else { printf("ionstats alignment: Unexpected OP combination: %s Cigar=%c, MD=%c !\n", alignment.Name.c_str(), alignment.CigarData[cigar_idx].Type, MD_op[MD_idx]); break; } if (num_errors*5 <= num_bases) AQ7_bases = num_bases; if (num_errors*10 <= num_bases) AQ10_bases = num_bases; if (num_errors*50 <= num_bases) AQ17_bases = num_bases; if (num_errors*100 <= num_bases) AQ20_bases = num_bases; if (num_errors == 0) AQ47_bases = num_bases; } // // Step 3. 
Profit // if (num_bases >= 20) aligned_histogram.Add(num_bases); if (AQ7_bases >= 20) AQ7_histogram.Add(AQ7_bases); if (AQ10_bases >= 20) AQ10_histogram.Add(AQ10_bases); if (AQ17_bases >= 20) AQ17_histogram.Add(AQ17_bases); if (AQ20_bases >= 20) AQ20_histogram.Add(AQ20_bases); if (AQ47_bases >= 20) AQ47_histogram.Add(AQ47_bases); } input_bam.Close(); // // Processing complete, generate ionstats_alignment.json // Json::Value output_json(Json::objectValue); output_json["meta"]["creation_date"] = get_time_iso_string(time(NULL)); output_json["meta"]["format_name"] = "ionstats_alignment"; output_json["meta"]["format_version"] = "1.0"; called_histogram.SaveToJson(output_json["full"]); aligned_histogram.SaveToJson(output_json["aligned"]); AQ7_histogram.SaveToJson(output_json["AQ7"]); AQ10_histogram.SaveToJson(output_json["AQ10"]); AQ17_histogram.SaveToJson(output_json["AQ17"]); AQ20_histogram.SaveToJson(output_json["AQ20"]); AQ47_histogram.SaveToJson(output_json["AQ47"]); error_by_position.SaveToJson(output_json["error_by_position"]); ofstream out(output_json_filename.c_str(), ios::out); if (out.good()) { out << output_json.toStyledString(); return 0; } else { fprintf(stderr, "ERROR: unable to write to '%s'\n", output_json_filename.c_str()); return 1; } return 0; }
void PileupEngine::PileupEnginePrivate::ParseAlignmentCigar(const BamAlignment& al) { // skip if unmapped if ( !al.IsMapped() ) return; // intialize local variables int genomePosition = al.Position; int positionInAlignment = 0; bool isNewReadSegment = true; bool saveAlignment = true; PileupAlignment pileupAlignment(al); // iterate over CIGAR operations const int numCigarOps = (const int)al.CigarData.size(); for (int i = 0; i < numCigarOps; ++i ) { const CigarOp& op = al.CigarData.at(i); // if op is MATCH if ( op.Type == 'M' ) { // if match op overlaps current position if ( genomePosition + (int)op.Length > CurrentPosition ) { // set pileup data pileupAlignment.IsCurrentDeletion = false; pileupAlignment.IsNextDeletion = false; pileupAlignment.IsNextInsertion = false; pileupAlignment.PositionInAlignment = positionInAlignment + (CurrentPosition - genomePosition); // check for beginning of read segment if ( genomePosition == CurrentPosition && isNewReadSegment ) pileupAlignment.IsSegmentBegin = true; // if we're at the end of a match operation if ( genomePosition + (int)op.Length - 1 == CurrentPosition ) { // if not last operation if ( i < numCigarOps - 1 ) { // check next CIGAR op const CigarOp& nextOp = al.CigarData.at(i+1); // if next CIGAR op is DELETION if ( nextOp.Type == 'D') { pileupAlignment.IsNextDeletion = true; pileupAlignment.DeletionLength = nextOp.Length; } // if next CIGAR op is INSERTION else if ( nextOp.Type == 'I' ) { pileupAlignment.IsNextInsertion = true; pileupAlignment.InsertionLength = nextOp.Length; } // if next CIGAR op is either DELETION or INSERTION if ( nextOp.Type == 'D' || nextOp.Type == 'I' ) { // if there is a CIGAR op after the DEL/INS if ( i < numCigarOps - 2 ) { const CigarOp& nextNextOp = al.CigarData.at(i+2); // if next CIGAR op is clipping or ref_skip if ( nextNextOp.Type == 'S' || nextNextOp.Type == 'N' || nextNextOp.Type == 'H' ) pileupAlignment.IsSegmentEnd = true; } else { pileupAlignment.IsSegmentEnd = true; // if next CIGAR 
op is clipping or ref_skip if ( nextOp.Type == 'S' || nextOp.Type == 'N' || nextOp.Type == 'H' ) pileupAlignment.IsSegmentEnd = true; } } // otherwise else { // if next CIGAR op is clipping or ref_skip if ( nextOp.Type == 'S' || nextOp.Type == 'N' || nextOp.Type == 'H' ) pileupAlignment.IsSegmentEnd = true; } } // else this is last operation else pileupAlignment.IsSegmentEnd = true; } } // increment markers genomePosition += op.Length; positionInAlignment += op.Length; } // if op is DELETION else if ( op.Type == 'D' ) { // if deletion op overlaps current position if ( genomePosition + (int)op.Length > CurrentPosition ) { // set pileup data pileupAlignment.IsCurrentDeletion = true; pileupAlignment.IsNextDeletion = false; pileupAlignment.IsNextInsertion = true; pileupAlignment.PositionInAlignment = positionInAlignment + (CurrentPosition - genomePosition); } // increment marker genomePosition += op.Length; } // if op is REF_SKIP else if ( op.Type == 'N' ) { genomePosition += op.Length; } // if op is INSERTION or SOFT_CLIP else if ( op.Type == 'I' || op.Type == 'S' ) { positionInAlignment += op.Length; } // checl for beginning of new read segment if ( op.Type == 'N' || op.Type == 'S' || op.Type == 'H' ) isNewReadSegment = true; else isNewReadSegment = false; // if we've moved beyond current position if ( genomePosition > CurrentPosition ) { if ( op.Type == 'N' ) saveAlignment = false; // ignore alignment if REF_SKIP break; } } // save pileup position if flag is true if ( saveAlignment ) CurrentPileupData.PileupAlignments.push_back( pileupAlignment ); }
// Trims putatively deaminated terminal bases from reads in a BAM file:
// a leading T (5' C->T) and/or trailing A (3' G->A) — with strand- and
// mate-aware index selection — is cut from the read (bases and qualities)
// before the alignment is written to the output BAM.
// NOTE(review): reads are assumed to run 5'->3' as stored; mapped
// reverse-strand handling relies on the stored (reverse-complemented)
// sequence — confirm against the producing pipeline.
int main (int argc, char *argv[]) {

    // leftover flags from a disabled mapped/unmapped mode switch (see
    // commented-out option parsing below); currently never set
    bool mapped =false;
    bool unmapped=false;

    const string usage=string(string(argv[0])+" [options] input.bam out.bam"+"\n\n"+
			      "This program takes a BAM file as input and produces\n"+
			      "another where the putative deaminated bases have\n"+
			      "have been cut\n"+
			      "\n"+
			      "Options:\n");
    // "\t"+"-u , --unmapped" +"\n\t\t"+"For an unmapped bam file"+"\n"+
    // "\t"+"-m , --mapped" +"\n\t\t"+"For an mapped bam file"+"\n");

    // print usage when called with no args or an explicit help flag
    if( (argc== 1) ||
	(argc== 2 && string(argv[1]) == "-h") ||
	(argc== 2 && string(argv[1]) == "-help") ||
	(argc== 2 && string(argv[1]) == "--help") ){
	cout<<"Usage:"<<endl;
	cout<<usage<<endl;
	cout<<""<<endl;
	return 1;
    }

    // disabled option parsing for the mapped/unmapped switch
    // for(int i=1;i<(argc-1);i++){ //all but the last arg
    // if(strcmp(argv[i],"-m") == 0 || strcmp(argv[i],"--mapped") == 0 ){
    // mapped=true;
    // continue;
    // }
    // if(strcmp(argv[i],"-u") == 0 || strcmp(argv[i],"--unmapped") == 0 ){
    // unmapped=true;
    // continue;
    // }
    // cerr<<"Unknown option "<<argv[i] <<" exiting"<<endl;
    // return 1;
    // }

    // exactly two positional arguments: input BAM and output BAM
    if(argc != 3){
	cerr<<"Error: Must specify the input and output BAM files";
	return 1;
    }

    string inbamFile =argv[argc-2];
    string outbamFile=argv[argc-1];

    // if(!mapped && !unmapped){
    // cerr << "Please specify whether you reads are mapped or unmapped" << endl;
    // return 1;
    // }
    // if(mapped && unmapped){
    // cerr << "Please specify either mapped or unmapped but not both" << endl;
    // return 1;
    // }

    BamReader reader;
    if ( !reader.Open(inbamFile) ) {
	cerr << "Could not open input BAM files." << endl;
	return 1;
    }

    // output BAM reuses the input's header and reference data
    vector<RefData> testRefData=reader.GetReferenceData();
    const SamHeader header = reader.GetHeader();
    const RefVector references = reader.GetReferenceData();

    BamWriter writer;
    if ( !writer.Open(outbamFile, header, references) ) {
	cerr << "Could not open output BAM file" << endl;
	return 1;
    }

    BamAlignment al;
    // BamAlignment al2;
    // bool al2Null=true;

    while ( reader.GetNextAlignment(al) ) {

	if(al.IsPaired() ){
	    if(al.IsFirstMate() ){
		//5' end, need to check first base only
		if(al.IsReverseStrand()){
		    // mate 1, reverse strand: the 5' base sits at the END of
		    // the stored sequence, complemented (T -> A)
		    // if(!al.IsMapped()){ cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl; //return 1; }
		    int indexToCheck;
		    //last
		    indexToCheck=al.QueryBases.length()-1;
		    if(toupper(al.QueryBases[indexToCheck]) == 'A'){
			//al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			al.QueryBases = al.QueryBases.substr(0,indexToCheck);
			al.Qualities = al.Qualities.substr(0, indexToCheck);
		    }
		}else{
		    // mate 1, forward strand: 5' base is the first stored base
		    int indexToCheck;
		    //first base
		    indexToCheck=0;
		    if(toupper(al.QueryBases[indexToCheck]) == 'T'){
			//al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			al.QueryBases = al.QueryBases.substr(indexToCheck+1);
			al.Qualities = al.Qualities.substr(indexToCheck+1);
		    }
		}
	    }else{
		//3' end, need to check last two bases only
		if( al.IsSecondMate() ){
		    if(al.IsReverseStrand()){
			// mate 2, reverse strand: 3' bases are at the end of
			// the stored sequence
			// if(!al.IsMapped()){ cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl; //return 1; }
			int indexToCheck;
			//second to last
			indexToCheck=al.QueryBases.length()-2;
			if(toupper(al.QueryBases[indexToCheck]) == 'T'){
			    //al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			    al.QueryBases = al.QueryBases.substr(0,indexToCheck);
			    al.Qualities = al.Qualities.substr(0, indexToCheck);
			}else{
			    //last
			    indexToCheck=al.QueryBases.length()-1;
			    if(toupper(al.QueryBases[indexToCheck]) == 'T'){
				//al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
				al.QueryBases = al.QueryBases.substr(0,indexToCheck);
				al.Qualities = al.Qualities.substr(0, indexToCheck);
			    }
			}
		    }else{
			// mate 2, forward strand: 3' bases are at the start of
			// the stored sequence, complemented (T -> A)
			int indexToCheck;
			//second base
			indexToCheck=1;
			if(toupper(al.QueryBases[indexToCheck]) == 'A'){
			    //al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			    al.QueryBases = al.QueryBases.substr(indexToCheck+1);
			    al.Qualities = al.Qualities.substr(indexToCheck+1);
			}else{
			    //first base
			    indexToCheck=0;
			    if(toupper(al.QueryBases[indexToCheck]) == 'A'){
				//al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
				al.QueryBases = al.QueryBases.substr(indexToCheck+1);
				al.Qualities = al.Qualities.substr(indexToCheck+1);
			    }
			}
		    }
		}else{
		    // paired but neither first nor second mate: malformed record
		    cerr << "Wrong state" << endl;
		    return 1;
		}
	    }
	}//end of paired end
	else{//we consider single reads to have been sequenced from 5' to 3'
	    if(al.IsReverseStrand()){ //need to consider
		if(!al.IsMapped()){
		    cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl;
		    //return 1;
		}
		// single end, reverse strand: 3' end is at the start of the
		// stored bases (check bases 0/1 for A), 5' end at the last base
		int indexToCheck;
		//second base
		indexToCheck=1;
		if(toupper(al.QueryBases[indexToCheck]) == 'A'){
		    // al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
		    // cout<<"51 "<<al.QueryBases<<endl;
		    // cout<<"51 "<<al.Qualities<<endl;
		    al.QueryBases = al.QueryBases.substr(indexToCheck+1);
		    al.Qualities = al.Qualities.substr(indexToCheck+1);
		    // cout<<"52 "<<al.QueryBases<<endl;
		    // cout<<"52 "<<al.Qualities<<endl;
		}else{
		    //first base
		    indexToCheck=0;
		    if(toupper(al.QueryBases[indexToCheck]) == 'A'){
			// al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			// cout<<"61 "<<al.QueryBases<<endl;
			// cout<<"61 "<<al.Qualities<<endl;
			al.QueryBases = al.QueryBases.substr(indexToCheck+1);
			al.Qualities = al.Qualities.substr(indexToCheck+1);
			// cout<<"62 "<<al.QueryBases<<endl;
			// cout<<"62 "<<al.Qualities<<endl;
		    }
		}
		//last
		indexToCheck=al.QueryBases.length()-1;
		if(toupper(al.QueryBases[indexToCheck]) == 'A'){
		    // al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
		    // cout<<"21 "<<al.QueryBases<<endl;
		    // cout<<"21 "<<al.Qualities<<endl;
		    al.QueryBases = al.QueryBases.substr(0,indexToCheck);
		    al.Qualities = al.Qualities.substr(0, indexToCheck);
		    // cout<<"22 "<<al.QueryBases<<endl;
		    // cout<<"22 "<<al.Qualities<<endl;
		}
	    }else{
		// single end, forward strand: 5' end is the first stored base,
		// 3' end is at the last two stored bases
		int indexToCheck;
		//first base
		indexToCheck=0;
		if(toupper(al.QueryBases[indexToCheck]) == 'T'){
		    // al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
		    // cout<<"11 "<<al.QueryBases<<endl;
		    // cout<<"11 "<<al.Qualities<<endl;
		    al.QueryBases = al.QueryBases.substr(indexToCheck+1);
		    al.Qualities = al.Qualities.substr(indexToCheck+1);
		    // cout<<"12 "<<al.QueryBases<<endl;
		    // cout<<"12 "<<al.Qualities<<endl;
		}
		//second to last
		indexToCheck=al.QueryBases.length()-2;
		if(toupper(al.QueryBases[indexToCheck]) == 'T'){
		    // al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
		    // cout<<"31 "<<al.QueryBases<<endl;
		    // cout<<"31 "<<al.Qualities<<endl;
		    al.QueryBases = al.QueryBases.substr(0,indexToCheck);
		    al.Qualities = al.Qualities.substr(0, indexToCheck);
		    // cout<<"32 "<<al.QueryBases<<endl;
		    // cout<<"32 "<<al.Qualities<<endl;
		}else{
		    //last
		    indexToCheck=al.QueryBases.length()-1;
		    if(toupper(al.QueryBases[indexToCheck]) == 'T'){
			// al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			// cout<<"41 "<<al.QueryBases<<endl;
			// cout<<"41 "<<al.Qualities<<endl;
			al.QueryBases = al.QueryBases.substr(0,indexToCheck);
			al.Qualities = al.Qualities.substr(0, indexToCheck);
			// cout<<"42 "<<al.QueryBases<<endl;
			// cout<<"42 "<<al.Qualities<<endl;
		    }
		}
	    }
	}//end of single end

	// every read (trimmed or not) is written to the output
	writer.SaveAlignment(al);

    }// while ( reader.GetNextAlignment(al) ) {

    reader.Close();
    writer.Close();

    return 0;
}
int main_asequantmultirg(const vector<string> &all_args) { Init(all_args); cerr << "* Reading bam file " << endl; OpenBam(bam_reader, bam_file); bam_reader.OpenIndex(bam_file + ".bai"); vector<string> readGroupVector; SamHeader header = bam_reader.GetHeader(); SamReadGroupDictionary headerRG = header.ReadGroups; for (SamReadGroupIterator it = headerRG.Begin(); it != headerRG.End(); it ++) { readGroupVector.push_back(it -> ID); } vector<RefData> chroms = bam_reader.GetReferenceData(); cout << "#CHROM" << "\t" << "POS" << "\t" << "REF" << "\t" << "ALT"; for (vector<string>::iterator it = readGroupVector.begin(); it != readGroupVector.end(); it ++) { cout << "\t" << *it; } cout << endl; StlFor(chrom_idx, chroms) { string &chrom = chroms[chrom_idx].RefName; vector<Snp> snps = snps_by_chrom[chrom]; int s = 0; // Index into snp array BamAlignment bam; bam_reader.Jump(chrom_idx); string align; string qualities; cerr << "* On chrom " << chrom << endl; while (bam_reader.GetNextAlignment(bam) && bam.RefID == chrom_idx) { if (bam.MapQuality < min_map_qual || !bam.IsMapped()) continue; string currentRG; Assert(bam.GetReadGroup(currentRG)); int start = AlignStart(bam); int end = AlignEnd(bam); // Move the current SNP pointer so that it is ahead of the read's start (since bam alignments are in sorted order) while (s < snps.size() && snps[s].pos < start) ++s; // Stop everything if we have visited all SNPs on this chrom if (s >= snps.size()) break; // Find any/all SNPs that are within the bam alignment int n = 0; // Number of SNPs overlapped while ((s + n) < snps.size() && snps[s + n].pos < end) // Then it overlaps! 
++n; // Now, look at each SNP and see which way it votes AlignedString(bam, align); AlignedQualities(bam, qualities); Assert(align.size() == qualities.size()); // Now, tally votes for (int i = 0; i < n; ++i) { Snp &snp = snps[s + i]; char base = align[snp.pos - start]; // Base from the read int qual = int(qualities[snp.pos - start]) - ascii_offset; // Base from the read //AssertMsg(qual >= 0 && qual <= 100, ToStr(qual) + "\n" + bam.Name + "\n" + CigarToStr(bam.CigarData) + "\n" + bam.QueryBases + "\n" + bam.Qualities); if (base == '-' || qual < min_base_qual) continue; map<string, Counts> &RG_counts = bam.IsReverseStrand() ? snp.rev : snp.fwd; map<string, Counts>::iterator searchIt = RG_counts.find(currentRG); if (searchIt == RG_counts.end()) { if (base == snp.ref) { RG_counts[currentRG].num_ref = 1; RG_counts[currentRG].num_alt = 0; RG_counts[currentRG].num_other = 0; } else if (base == snp.alt) { RG_counts[currentRG].num_ref = 0; RG_counts[currentRG].num_alt = 1; RG_counts[currentRG].num_other = 0; } else { RG_counts[currentRG].num_ref = 0; RG_counts[currentRG].num_alt = 0; RG_counts[currentRG].num_other = 1; } } else { if (base == snp.ref) { searchIt -> second.num_ref += 1; } else if (base == snp.alt) { searchIt -> second.num_alt += 1; } else { searchIt -> second.num_other += 1; } } } } // Output counts for (int s = 0; s < snps.size(); ++s) { cout << chrom << "\t" << snps[s].pos + 1 << "\t" << snps[s].ref << "\t" << snps[s].alt; for (vector<string>::iterator it = readGroupVector.begin(); it != readGroupVector.end(); it ++) { map<string, Counts>::iterator searchIt = snps[s].fwd.find(*it); if (searchIt != snps[s].fwd.end()) { cout << "\t" << searchIt -> second.num_ref << "," << searchIt -> second.num_alt << "," << searchIt -> second.num_other << ","; } else { cout << "\t" << "0,0,0,"; } searchIt = snps[s].rev.find(*it); if (searchIt != snps[s].rev.end()) { cout << searchIt -> second.num_ref << "," << searchIt -> second.num_alt << "," << searchIt -> 
second.num_other; } else { cout << "0,0,0"; } } cout << endl; } }
// Stream BAM records from stdin, realign each read against a variant DAG
// built from a VCF over a sliding window of the FASTA reference, and write
// the (re-sorted) records to stdout.
//
// Inputs (via params): fasta_reference, vcf_file (required), dag_window_size,
// match/mismatch scores, and mode flags (dry_run, debug, display_dag,
// unsorted_output, only_realigned, flat_input_vcf, flatten_flank).
// Side effects: reads BAM from stdin, writes BAM to stdout (unless dry_run),
// logs to stderr, and calls exit(1) on unopenable inputs/outputs.
void realign_bam(Parameters& params) {

    FastaReference reference;
    reference.open(params.fasta_reference);
    bool suppress_output = false;  // NOTE(review): appears unused in this function — confirm before removing
    int dag_window_size = params.dag_window_size;

    // open BAM file
    BamReader reader;
    if (!reader.Open("stdin")) {
        cerr << "could not open stdin for reading" << endl;
        exit(1);
    }

    BamWriter writer;
    if (!params.dry_run && !writer.Open("stdout", reader.GetHeaderText(), reader.GetReferenceData())) {
        cerr << "could not open stdout for writing" << endl;
        exit(1);
    }

    // store the names of all the reference sequences in the BAM file
    map<int, string> referenceIDToName;
    vector<RefData> referenceSequences = reader.GetReferenceData();
    int i = 0;
    for (RefVector::iterator r = referenceSequences.begin(); r != referenceSequences.end(); ++r) {
        referenceIDToName[i] = r->RefName;
        ++i;
    }

    vcf::VariantCallFile vcffile;
    if (!params.vcf_file.empty()) {
        if (!vcffile.open(params.vcf_file)) {
            cerr << "could not open VCF file " << params.vcf_file << endl;
            exit(1);
        }
    } else {
        cerr << "realignment requires VCF file" << endl;
        exit(1);
    }
    vcf::Variant var(vcffile);

    BamAlignment alignment;
    // position-keyed queue used to re-sort alignments whose positions moved
    // during realignment before they are written out
    map<long int, vector<BamAlignment> > alignmentSortQueue;

    // get alignment
    // assemble DAG in region around alignment
    // loop for each alignment in BAM:
    //     update DAG when current alignment gets close to edge of assembled DAG
    //     attempt to realign if read has a certain number of mismatches + gaps or softclips, weighted by basequal
    //     if alignment to DAG has fewer mismatches and gaps than original alignment, use it
    //         flatten read into reference space (for now just output alleles from VCF un-spanned insertions)
    //     write read to queue for streaming re-sorting (some positional change will occur)

    long int dag_start_position = 0;
    string currentSeqname;
    string ref;
    //vector<Cigar> cigars; // contains the Cigar strings of nodes in the graph
    //vector<long int> refpositions; // contains the reference start coords of nodes in the graph
    ReferenceMappings ref_map;
    gssw_graph* graph = gssw_graph_create(0);
    int8_t* nt_table = gssw_create_nt_table();
    int8_t* mat = gssw_create_score_matrix(params.match, params.mism);

    int total_reads = 0;
    int total_realigned = 0;
    int total_improved = 0;
    bool emptyDAG = false; // if the dag is constructed over empty sequence
                           // such as when realigning reads mapped to all-N sequence

    if (params.debug) { cerr << "about to start processing alignments" << endl; }

    while (reader.GetNextAlignment(alignment)) {

        string& seqname = referenceIDToName[alignment.RefID];

        if (params.debug) {
            cerr << "--------------------------------------------" << endl
                 << "processing alignment " << alignment.Name << " at "
                 << seqname << ":" << alignment.Position << endl;
        }

        /*
        if (!alignment.IsMapped() && graph->size == 0) {
            if (params.debug) {
                cerr << "unable to build DAG using unmapped read "
                     << alignment.Name << " @ " << seqname << ":" << alignment.Position
                     << " no previous mapped read found and DAG currently empty" << endl;
            }
            alignmentSortQueue[dag_start_position+dag_window_size].push_back(alignment);
            continue;
        }
        */

        ++total_reads;

        // kept so the read can be rolled back if realignment is rejected or throws
        BamAlignment originalAlignment = alignment;
        long unsigned int initialAlignmentPosition = alignment.Position;
        //if (dag_start_position == 1) {
        //    dag_start_position = max(1, (int)initialAlignmentPosition - dag_window_size/2);
        //}

        // should we construct a new DAG? do so when 3/4 of the way through the current one
        // center on current position + 1/2 dag window
        // TODO check this scheme using some scribbles on paper
        // alignment.IsMapped()
        if ((seqname != currentSeqname
             || ((alignment.Position + (alignment.QueryBases.size()/2)
                  > (3*dag_window_size/4) + dag_start_position)))
            && alignment.Position < reference.sequenceLength(seqname)) {

            if (seqname != currentSeqname) {
                if (params.debug) { cerr << "switched ref seqs" << endl; }
                dag_start_position = max((long int) 0,
                                         (long int) (alignment.GetEndPosition() - dag_window_size/2));
                // recenter DAG
            } else if (!ref_map.empty()) {
                dag_start_position = dag_start_position + dag_window_size/2;
                dag_start_position = max(dag_start_position,
                                         (long int) (alignment.GetEndPosition() - dag_window_size/2));
            } else {
                dag_start_position = alignment.Position - dag_window_size/2;
            }
            dag_start_position = max((long int)0, dag_start_position);
            // TODO get sequence length and use to bound noted window size (edge case)
            //cerr << "getting ref " << seqname << " " << max((long int) 0, dag_start_position) << " " << dag_window_size << endl;

            // get variants for new DAG
            vector<vcf::Variant> variants;
            if (!vcffile.setRegion(seqname,
                                   dag_start_position + 1,
                                   dag_start_position + dag_window_size)) {
                // this is not necessarily an error; there should be a better way to check for VCF file validity
                /*
                cerr << "could not set region on VCF file to "
                     << currentSeqname << ":" << dag_start_position
                     << "-" << dag_start_position + ref.size() << endl;
                */
                //exit(1);
            } else {
                // check first variant
                // shift the window start left until the first returned variant no
                // longer sits at/before the window's first base (avoids splitting
                // a variant at the left edge)
                if (vcffile.getNextVariant(var)) {
                    while (var.position <= dag_start_position + 1) {
                        //cerr << "var position == dag_start_position " << endl;
                        dag_start_position -= 1;
                        vcffile.setRegion(seqname,
                                          dag_start_position + 1,
                                          dag_start_position + dag_window_size);
                        if (!vcffile.getNextVariant(var)) { break; }
                    }
                }
                // re-read the (possibly shifted) region and keep only variants
                // fully contained within the window
                vcffile.setRegion(seqname,
                                  dag_start_position + 1,
                                  dag_start_position + dag_window_size);
                while (vcffile.getNextVariant(var)) {
                    if (params.debug) cerr << "getting variant at " << var.sequenceName << ":" << var.position << endl;
                    //cerr << var.position << " + " << var.ref.length() << " <= " << dag_start_position << " + " << dag_window_size << endl;
                    //cerr << var.position << " >= " << dag_start_position << endl;
                    if (var.position + var.ref.length() <= dag_start_position + dag_window_size
                        && var.position >= dag_start_position) {
                        variants.push_back(var);
                    }
                }
            }

            //cerr << "dag_start_position " << dag_start_position << endl;
            ref = reference.getSubSequence(seqname,
                                           max((long int) 0, dag_start_position),
                                           dag_window_size); // 0/1 conversion

            // clear graph and metadata
            ref_map.clear();
            //cigars.clear();
            //refpositions.clear();
            gssw_graph_destroy(graph);

            if (params.debug) { cerr << "constructing DAG" << endl; }
            // and build the DAG
            graph = gssw_graph_create(0);
            constructDAGProgressive(graph,
                                    ref_map,
                                    ref,
                                    seqname,
                                    variants,
                                    dag_start_position,
                                    nt_table,
                                    mat,
                                    params.flat_input_vcf);

            if (params.debug) {
                cerr << "graph has " << graph->size << " nodes" << endl;
                cerr << "DAG generated from input variants over "
                     << seqname << ":" << dag_start_position
                     << "-" << dag_start_position + dag_window_size << endl;
            }
            if (params.display_dag) {
                gssw_graph_print(graph);
                /*
                for (Backbone::iterator b = backbone.begin(); b != backbone.end(); ++b) {
                    cout << b->first << " " << b->first->id << " "
                         << b->second.ref_position << " " << b->second.cigar << endl
                         << b->first->seq << endl;
                }
                */
            }

            // NOTE(review): relies on && binding tighter than || — reads as
            // (single node over all-N reference) OR (graph has no nodes)
            if (graph->size == 1 && allN(ref) || graph->size == 0) {
                if (params.debug) {
                    cerr << "DAG is empty (1 node, all N). Alignment is irrelevant." << endl;
                }
                emptyDAG = true;
            } else {
                emptyDAG = false;
            }
        }

        AlignmentStats stats_before;
        bool was_mapped = alignment.IsMapped();
        bool has_realigned = false;
        if (was_mapped) {
            // extend the cached reference slice when the read runs past the DAG window
            if (dag_start_position + dag_window_size < alignment.GetEndPosition()) {
                ref = reference.getSubSequence(seqname,
                                               max((long int) 0, dag_start_position),
                                               alignment.GetEndPosition() - dag_start_position); // 0/1 conversion
            }
        }

        if (params.debug) {
            if (emptyDAG) {
                cerr << "cannot realign against empty (all-N single node) graph" << endl;
            }
        }

        if (!emptyDAG && shouldRealign(alignment, ref, dag_start_position, params, stats_before)) {

            ++total_realigned;
            if (params.debug) {
                cerr << "realigning: " << alignment.Name
                     << " " << alignment.QueryBases << endl
                     << " aligned @ " << alignment.Position
                     << " to variant graph over "
                     << seqname << ":" << dag_start_position
                     << "-" << dag_start_position + dag_window_size << endl;
            }

            //{
            try {

                Cigar flat_cigar;
                string read = alignment.QueryBases;
                string qualities = alignment.Qualities;
                int score;
                long int position;
                string strand;
                gssw_graph_mapping* gm = gswalign(graph,
                                                  ref_map,
                                                  read,
                                                  qualities,
                                                  params,
                                                  position,
                                                  score,
                                                  flat_cigar,
                                                  strand,
                                                  nt_table,
                                                  mat);
                // NOTE(review): the destroy call below is commented out, so gm
                // appears to leak once per realigned read — confirm who owns it
                // gssw_graph_mapping_destroy(gm);

                if (params.dry_run) {
                    if (strand == "-" && !alignment.IsMapped()) {
                        read = reverseComplement(read);
                    }
                    cout << read << endl;
                    cout << graph_mapping_to_string(gm) << endl;
                    cout << score << " " << strand << " " << position << " " << flat_cigar << endl;
                } else {
                    /*
                    if (strand == "-") {
                        read = reverseComplement(trace_report.read);
                    }
                    */
                    // TODO the qualities are not on the right side of the read
                    if (strand == "-" && alignment.IsMapped()) {
                        // if we're realigning, this is always true unless we swapped strands
                        alignment.SetIsReverseStrand(true);
                        //reverse(alignment.Qualities.begin(), alignment.Qualities.end()); // reverse qualities
                    }
                    //alignment.QueryBases = reverseComplement(trace_report.read);
                    alignment.QueryBases = read;
                    alignment.Qualities = qualities;
                    alignment.Position = position;// + 1;// + 1;//(trace_report.node->position - 1) + trace_report.x;
                    alignment.SetIsMapped(true);
                    if (!alignment.MapQuality) {
                        alignment.MapQuality = 20; // horrible hack... at least approximate with alignment mismatches against graph
                    }

                    // check if somehow we've ended up with an indel at the ends
                    // if so, grab the reference sequence right beyond it and add
                    // a single match to the cigar, allowing variant detection methods
                    // to run on the results without internal modification
                    Cigar& cigar = flat_cigar;
                    //cerr << flat_cigar << " " << flat_cigar.readLen() << " " << flat_cigar.refLen() << endl;
                    int flankSize = params.flatten_flank;

                    // leading indel (or softclip immediately followed by indel):
                    // pad the start of the read with flankSize reference-matching bases
                    if (cigar.front().isIndel() ||
                        (cigar.front().isSoftclip() && cigar.at(1).isIndel())) {
                        alignment.Position -= flankSize;
                        string refBase = reference.getSubSequence(seqname, alignment.Position, flankSize);
                        if (cigar.front().isSoftclip()) {
                            alignment.QueryBases.erase(alignment.QueryBases.begin(),
                                                       alignment.QueryBases.begin()+cigar.front().length);
                            alignment.Qualities.erase(alignment.Qualities.begin(),
                                                      alignment.Qualities.begin()+cigar.front().length);
                            cigar.erase(cigar.begin());
                        }
                        alignment.QueryBases.insert(0, refBase);
                        alignment.Qualities.insert(0, string(flankSize, shortInt2QualityChar(30)));
                        Cigar newCigar;
                        newCigar.push_back(CigarElement(flankSize, 'M'));
                        newCigar.append(flat_cigar);
                        flat_cigar = newCigar;
                    }

                    // trailing indel (or indel just before a trailing softclip):
                    // pad the end of the read with flankSize reference-matching bases
                    if (cigar.back().isIndel() ||
                        (cigar.back().isSoftclip() && cigar.at(cigar.size()-2).isIndel())) {
                        string refBase = reference.getSubSequence(seqname,
                                                                  alignment.Position + flat_cigar.refLen(),
                                                                  flankSize);
                        if (cigar.back().isSoftclip()) {
                            alignment.QueryBases.erase(alignment.QueryBases.end()-cigar.back().length,
                                                       alignment.QueryBases.end());
                            alignment.Qualities.erase(alignment.Qualities.end()-cigar.back().length,
                                                      alignment.Qualities.end());
                            cigar.pop_back();
                        }
                        Cigar newCigar;
                        newCigar.push_back(CigarElement(flankSize, 'M'));
                        flat_cigar.append(newCigar);
                        //flat_cigar.append(newCigar);
                        alignment.QueryBases.append(refBase);
                        alignment.Qualities.append(string(flankSize, shortInt2QualityChar(30)));
                    }

                    flat_cigar.toCigarData(alignment.CigarData);
                    //cerr << flat_cigar << " " << flat_cigar.readLen() << " " << flat_cigar.refLen() << endl;

                    // the realigned read may now extend past the window; refresh ref
                    if (dag_start_position + dag_window_size < alignment.GetEndPosition()) {
                        ref = reference.getSubSequence(seqname,
                                                       max((long int) 0, dag_start_position),
                                                       alignment.GetEndPosition() - dag_start_position); // 0/1 conversion
                    }
                    AlignmentStats stats_after;
                    countMismatchesAndGaps(alignment, flat_cigar, ref, dag_start_position, stats_after, params.debug);

                    /*
                    if ((!was_mapped || (stats_before.softclip_qsum >= stats_after.softclip_qsum
                                         && stats_before.mismatch_qsum >= stats_after.mismatch_qsum))
                        && acceptRealignment(alignment, ref, dag_start_position, params, stats_after)) {
                    */

                    /*
                    if ((!was_mapped || (stats_before.softclip_qsum + stats_before.mismatch_qsum
                                         >= stats_after.softclip_qsum + stats_after.mismatch_qsum))
                        && acceptRealignment(alignment, ref, dag_start_position, params, stats_after)) {
                    */

                    // we accept the new alignment if...
                    if (!was_mapped // it wasn't mapped previously
                        // or if we have removed soft clips or mismatches (per quality) from the alignment
                        //|| ((stats_before.softclip_qsum >= stats_after.softclip_qsum
                        //     && stats_before.mismatch_qsum >= stats_after.mismatch_qsum)
                        || ((stats_before.softclip_qsum + stats_before.mismatch_qsum
                             >= stats_after.softclip_qsum + stats_after.mismatch_qsum)
                            // and if we have added gaps, we have added them to remove mismatches or softclips
                            && (stats_before.gaps >= stats_after.gaps
                                // accept any time we reduce gaps while not increasing softclips/mismatches
                                || (stats_before.gaps < stats_after.gaps
                                    // and allow gap increases when they improve the alignment
                                    && (stats_before.softclip_qsum + stats_before.mismatch_qsum
                                        > stats_after.softclip_qsum + stats_after.mismatch_qsum))))
                        // and the alignment must not have more than the acceptable number of gaps, softclips, or mismatches
                        // as provided in input parameters
                        && acceptRealignment(alignment, ref, dag_start_position, params, stats_after)) {

                        // keep the alignment
                        // TODO require threshold of softclips to keep alignment (or count of gaps, mismatches,...)
                        if (params.debug) {
                            cerr << "realigned " << alignment.Name << " to graph, which it maps to with "
                                 << stats_after.mismatch_qsum << "q in mismatches and "
                                 << stats_after.softclip_qsum << "q in soft clips" << endl;
                        }
                        ++total_improved;
                        has_realigned = true;
                    } else {
                        // reset to old version of alignment
                        if (params.debug) {
                            cerr << "failed realignment of " << alignment.Name << " to graph, which it maps to with: "
                                 << stats_after.mismatch_qsum << "q in mismatches "
                                 << "(vs " << stats_before.mismatch_qsum << "q before), and "
                                 << stats_after.softclip_qsum << "q in soft clips "
                                 << "(vs " << stats_before.softclip_qsum << "q before) " << endl;
                        }
                        has_realigned = false;
                        alignment = originalAlignment;
                    }
                }
            //} // try block
            } catch (...) {
                cerr << "exception when realigning " << alignment.Name
                     << " at position " << referenceIDToName[alignment.RefID]
                     << ":" << alignment.Position
                     << " " << alignment.QueryBases << endl;
                // reset to original alignment
                has_realigned = false;
                alignment = originalAlignment;
            }
        }

        // ensure correct order if alignments move
        // NOTE(review): initialAlignmentPosition is unsigned, so this subtraction
        // wraps when the position is < dag_window_size before converting to long
        // — in practice yields a negative bound, but confirm intended
        long int maxOutputPos = initialAlignmentPosition - dag_window_size;
        // if we switched sequences we need to flush out all the reads from the previous one
        string lastSeqname = currentSeqname;
        if (seqname != currentSeqname) {
            // so the max output position is set past the end of the last chromosome
            if (!currentSeqname.empty()) {
                maxOutputPos = reference.sequenceLength(currentSeqname) + dag_window_size;
            }
            currentSeqname = seqname;
        }

        if (!params.dry_run) {
            // write out queued alignments at positions <= maxOutputPos, then
            // drop them from the queue and enqueue the current read
            map<long int, vector<BamAlignment> >::iterator p = alignmentSortQueue.begin();
            for ( ; p != alignmentSortQueue.end(); ++p) {
                // except if we are running in unsorted mode, stop when we are at the window size
                if (!params.unsorted_output && p->first > maxOutputPos) {
                    break; // no more to do
                } else {
                    for (vector<BamAlignment>::iterator a = p->second.begin(); a != p->second.end(); ++a) {
                        writer.SaveAlignment(*a);
                    }
                }
            }
            if (p != alignmentSortQueue.begin()) {
                alignmentSortQueue.erase(alignmentSortQueue.begin(), p);
            }
            if (!params.only_realigned || has_realigned) {
                alignmentSortQueue[alignment.Position].push_back(alignment);
            }
        }
    } // end GetNextAlignment loop

    // drain whatever is still queued after the last input record
    if (!params.dry_run) {
        map<long int, vector<BamAlignment> >::iterator p = alignmentSortQueue.begin();
        for ( ; p != alignmentSortQueue.end(); ++p) {
            for (vector<BamAlignment>::iterator a = p->second.begin(); a != p->second.end(); ++a)
                writer.SaveAlignment(*a);
        }
    }

    // release graph/scoring-table resources and close the BAM streams
    gssw_graph_destroy(graph);
    free(nt_table);
    free(mat);
    reader.Close();
    writer.Close();

    if (params.debug) {
        cerr << "total reads:\t" << total_reads << endl;
        cerr << "realigned:\t" << total_realigned << endl;
        cerr << "improved:\t" << total_improved << endl;
    }
}