void bamParser::parse(Reads & reads, string & filename, vector<string> & chrs_to_parse) {
    BamTools::BamReader bam;
    BamTools::BamAlignment read;
    string chr;
    uint64_t readCnt = 0;
    uint32_t meanReadLen = 0;

    if (!(bam.Open(filename))) {
        throw FileNotGood(filename);
    }

    const BamTools::RefVector refvec = bam.GetReferenceData();
    while (bam.GetNextAlignment(read)) {
        chr = getR1Chr(read, refvec);
        if (isGoodRead(read)) {
            if (isChrToParse(chrs_to_parse, chr)) {
                updateAvgReadLength(readCnt, meanReadLen, read);
                insertRead(read, reads, chr);
            }
        }
    }
    reads.setReadlength(meanReadLen);
}
std::string getQuickStats(const std::string &bamFile, std::map<std::string, int> &keyLen, unsigned int &nFlowFZ, unsigned int &nFlowZM) {
    std::string errMsg = "";
    BamTools::BamReader bamReader;
    if (!bamReader.Open(bamFile)) {
        errMsg += "Failed to open bam " + bamFile + "\n";
        return errMsg;
    }
    BamTools::SamHeader samHeader = bamReader.GetHeader();
    for (BamTools::SamReadGroupIterator itr = samHeader.ReadGroups.Begin(); itr != samHeader.ReadGroups.End(); ++itr) {
        if (itr->HasID())
            keyLen[itr->ID] = itr->HasKeySequence() ? itr->KeySequence.length() : 0;
        if (itr->HasFlowOrder())
            nFlowZM = std::max(nFlowZM, (unsigned int) itr->FlowOrder.length());
    }
    // Quick check only: inspect the first alignment for an FZ tag rather than scanning the whole file.
    BamTools::BamAlignment alignment;
    std::vector<uint16_t> flowIntFZ;
    while (bamReader.GetNextAlignment(alignment)) {
        if (alignment.GetTag("FZ", flowIntFZ))
            nFlowFZ = flowIntFZ.size();
        break;
    }
    bamReader.Close();
    if (nFlowFZ == 0)
        std::cout << "NOTE: bam file has no flow signals in FZ tag: " + bamFile + "\n";
    if (nFlowZM == 0)
        std::cout << "NOTE: bam file has no flow signals in ZM tag: " + bamFile + "\n";
    return errMsg;
}
int bamCheck(std::string bamFile, BamTools::BamReader & reader) {
    if (!reader.Open(bamFile)) { // bam file
        std::cerr << "Could not open input BAM file." << std::endl;
        reader.Close();
        return -1; // non-zero signals failure; the old "return false" collided with the success value 0
    } else {
        time_t now = time(0);
        std::cerr << "*******************************\n"
                  << ctime(&now)
                  << "************Entropy************\n"
                  << "Opening " << bamFile << std::endl;
        std::cerr << "===============================" << std::endl;
        return 0;
    }
}
int readerToMeth(BamTools::BamReader & reader1, BamTools::BamReader & reader2,
                 std::map<std::string, std::vector<std::string> > & lociMeth1,
                 std::map<std::string, std::vector<std::string> > & lociMeth2,
                 BamTools::RefVector::const_iterator i, int d,
                 const BamTools::RefVector refs, std::string sample) {
    const int r1 = reader1.GetReferenceID(i->RefName);
    const int r2 = reader2.GetReferenceID(i->RefName);
    const int rl = i->RefLength;
    // Set the region on both readers (no short-circuiting), then only process if both succeeded.
    const bool region1Ok = reader1.SetRegion(r1, 0, r1, rl);
    const bool region2Ok = reader2.SetRegion(r2, 0, r2, rl);
    if (region1Ok && region2Ok) {
        std::cerr << "Processing " << i->RefName << std::endl;
        BamTools::BamAlignment al;
        while (reader1.GetNextAlignment(al)) {
            //std::cout << al.RefID << std::endl;
            byread(al, d, lociMeth1, refs, sample);
        }
        while (reader2.GetNextAlignment(al)) {
            //std::cout << al.RefID << std::endl;
            byread(al, d, lociMeth2, refs, sample);
        }
    }
    reader1.Rewind();
    reader2.Rewind();
    return 0;
}
void OpenMyBam(BamTools::BamReader &bamReader, char *bamFile) {
    if (!bamReader.Open(bamFile)) {
        cerr << " ERROR: fail to open bam " << bamFile << endl;
        // TODO: throw an exception here instead of just logging
    }
    // Locate the index by appending ".bai" (file.bam -> file.bam.bai)
    string bamIndex(bamFile);
    bamIndex.append(".bai");
    if (!bamReader.OpenIndex(bamIndex)) {
        cerr << "ERROR: fail to open bam index " << bamIndex << endl;
        // TODO: throw an exception here instead of just logging
    }
}
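// A minimal usage sketch for OpenMyBam (not part of the original sources); the BAM path and the
// region coordinates below are purely illustrative, and only standard BamTools calls are used.
static void exampleOpenMyBamUsage() {
    BamTools::BamReader reader;
    char bamPath[] = "sample.bam";             // illustrative path; also expects sample.bam.bai
    OpenMyBam(reader, bamPath);
    // Jump to the first reference, positions 0..1,000,000, and iterate the alignments there.
    if (reader.SetRegion(0, 0, 0, 1000000)) {
        BamTools::BamAlignment al;
        while (reader.GetNextAlignment(al)) {
            // ... process al ...
        }
    }
    reader.Close();
}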
Batch::RunStatus PairedEndBatch::parseAlignmentFile(void) {
    // open reader on new BAM alignment file
    BamTools::BamReader reader;
    if (!reader.Open(m_generatedBam)) {
        m_errorString = "could not open generated BAM file: ";
        m_errorString.append(m_generatedBam);
        m_errorString.append(" to parse alignments");
        return Batch::Error;
    }

    // set up data containers
    m_result.ReadLengths.reserve(2 * m_settings->BatchSize);
    m_result.FragmentLengths.reserve(m_settings->BatchSize);

    // plow through alignments
    BamTools::BamAlignment mate1;
    BamTools::BamAlignment mate2;
    while (reader.GetNextAlignmentCore(mate1)) {

        // store mate1 read length, regardless of aligned state
        m_result.ReadLengths.push_back(mate1.Length);

        // read mate2
        if (reader.GetNextAlignmentCore(mate2)) {

            // store mate2 read length, regardless of aligned state
            m_result.ReadLengths.push_back(mate2.Length);

            // if both mates mapped to same reference
            if (mate1.IsMapped() && mate2.IsMapped() && (mate1.RefID == mate2.RefID)) {
                // calculate & store fragment length
                m_result.FragmentLengths.push_back(calculateFragmentLength(mate1, mate2));
            }
        }
    }
    reader.Close();

    // remove extreme outliers
    removeOutliers(m_result.FragmentLengths);
    removeOutliers(m_result.ReadLengths);

    // if we get here, all should be OK
    return Batch::Normal;
}
void BamHeaderHelper::GetRefID(BamTools::BamReader &bamReader) {
    BamTools::SamHeader samHeader = bamReader.GetHeader();
    for (BamTools::SamSequenceIterator itr = samHeader.Sequences.Begin(); itr != samHeader.Sequences.End(); ++itr) {
        string bamseq = itr->Name;
        bam_sequence_names.push_back(bamseq);
    }
}
std::vector< Sample::SharedPtr > BamAlignmentReader::GetBamReaderSamples(const std::string& bamPath) {
    std::vector< Sample::SharedPtr > samplePtrs;
    BamTools::BamReader bamReader;
    if (!bamReader.Open(bamPath)) {
        throw "Unable to open bam file";
    }
    auto readGroups = bamReader.GetHeader().ReadGroups;
    auto iter = readGroups.Begin();
    for (; iter != readGroups.End(); ++iter) {
        auto samplePtr = std::make_shared< Sample >((*iter).Sample, (*iter).ID, bamPath);
        samplePtrs.emplace_back(samplePtr);
    }
    bamReader.Close();
    return samplePtrs;
}
uint32_t BamAlignmentReader::GetReadLength(const std::string& bamPath) {
    uint32_t bamReadLength = 300; // fallback value if no primary alignment is found
    BamTools::BamReader bamReader;
    if (!bamReader.Open(bamPath)) {
        throw "Unable to open bam file";
    }
    BamTools::BamAlignment bamAlignment;
    while (bamReader.GetNextAlignment(bamAlignment)) {
        if (bamAlignment.IsPrimaryAlignment()) {
            bamReadLength = bamAlignment.QueryBases.size();
            break;
        }
    }
    bamReader.Close();
    return bamReadLength;
}
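// A small caller sketch (not part of the original sources) combining the two helpers above.
// It assumes GetBamReaderSamples and GetReadLength are static members of BamAlignmentReader
// (the class declaration is not shown here), and "sample.bam" is an illustrative path.
static void exampleBamSummary() {
    const std::string bamPath = "sample.bam";
    std::vector< Sample::SharedPtr > samples = BamAlignmentReader::GetBamReaderSamples(bamPath);
    uint32_t readLength = BamAlignmentReader::GetReadLength(bamPath);
    std::cout << samples.size() << " read group sample(s); first primary read length: " << readLength << std::endl;
}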
bool getNextAlignment(BamTools::BamAlignment &alignment, BamTools::BamReader &bamReader,
                      const std::map<std::string, int> &groupID,
                      std::vector< BamTools::BamAlignment > &alignmentSample,
                      std::map<std::string, int> &wellIndex, unsigned int nSample) {
    if (nSample > 0) {
        // We are randomly sampling, so the next read should come from the sample that was already taken from the bam file
        if (alignmentSample.size() > 0) {
            alignment = alignmentSample.back();
            alignmentSample.pop_back();
            alignment.BuildCharData();
            return true;
        } else {
            return false;
        }
    } else {
        // No random sampling, so we're either returning everything or we're looking for specific read names
        bool storeRead = false;
        while (bamReader.GetNextAlignment(alignment)) {
            if (groupID.size() > 0) {
                std::string thisReadGroupID = "";
                // Skip reads that lack an RG tag or whose read group is not in groupID.
                // (A stray semicolon after this condition previously made the continue unconditional.)
                if (!alignment.GetTag("RG", thisReadGroupID) || (groupID.find(thisReadGroupID) == groupID.end()))
                    continue;
            }
            storeRead = true;
            if (wellIndex.size() > 0) {
                // We are filtering by position, so check if we should skip or keep the read
                int thisCol, thisRow;
                if (1 != ion_readname_to_rowcol(alignment.Name.c_str(), &thisRow, &thisCol))
                    std::cerr << "Error parsing read name: " << alignment.Name << "\n";
                std::stringstream wellIdStream;
                wellIdStream << thisCol << ":" << thisRow;
                std::map<std::string, int>::iterator wellIndexIter = wellIndex.find(wellIdStream.str());
                if (wellIndexIter != wellIndex.end()) {
                    // If the read ID matches we should keep it, unless it is a duplicate
                    if (wellIndexIter->second >= 0) {
                        storeRead = true;
                        wellIndexIter->second = -1;
                    } else {
                        storeRead = false;
                        std::cerr << "WARNING: found extra instance of readID " << wellIdStream.str() << ", keeping only first\n";
                    }
                } else {
                    // read ID is not one we should keep
                    storeRead = false;
                }
            }
            if (storeRead)
                break;
        }
        return storeRead;
    }
}
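// Hypothetical usage sketch (not from the original sources): keep only reads from wells (12,7)
// and (40,3), with no read-group filter and no random sampling. "reads.bam" and the well
// coordinates are illustrative; the "col:row" key format and the >= 0 / -1 value convention
// follow the wellIndex handling in getNextAlignment above.
static void exampleWellFilteredIteration() {
    BamTools::BamReader reader;
    if (!reader.Open("reads.bam"))
        return;
    std::map<std::string, int> groupID;                   // empty: no read-group filtering
    std::vector< BamTools::BamAlignment > alignmentSample; // unused when nSample == 0
    std::map<std::string, int> wellIndex;                 // keys are "col:row"; value >= 0 means "not yet seen"
    wellIndex["12:7"] = 0;
    wellIndex["40:3"] = 1;
    BamTools::BamAlignment al;
    while (getNextAlignment(al, reader, groupID, alignmentSample, wellIndex, 0))
        std::cout << al.Name << "\n";
    reader.Close();
}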
void BamHeaderHelper::GetFlowOrder(BamTools::BamReader &bamReader) {
    BamTools::SamHeader samHeader = bamReader.GetHeader();
    if (!samHeader.HasReadGroups()) {
        //bamReader.Close();
        cerr << "ERROR: there is no read group in " << "this file" << endl;
        //exit(1);
    }
    for (BamTools::SamReadGroupIterator itr = samHeader.ReadGroups.Begin(); itr != samHeader.ReadGroups.End(); ++itr) {
        if (itr->HasFlowOrder()) {
            flow_order_set.push_back(itr->FlowOrder);
            //flowKey = itr->KeySequence;
        }
    }
}
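// Hypothetical usage sketch (not part of the original sources) for the BamHeaderHelper accessors
// above; it assumes BamHeaderHelper is default-constructible and that bam_sequence_names and
// flow_order_set are accessible members. The path is illustrative.
static void exampleHeaderHelperUsage() {
    BamTools::BamReader reader;
    if (!reader.Open("sample.bam"))
        return;
    BamHeaderHelper helper;
    helper.GetRefID(reader);      // fills helper.bam_sequence_names from @SQ lines
    helper.GetFlowOrder(reader);  // fills helper.flow_order_set from @RG flow orders
    for (size_t i = 0; i < helper.bam_sequence_names.size(); ++i)
        cout << helper.bam_sequence_names[i] << endl;
    for (size_t i = 0; i < helper.flow_order_set.size(); ++i)
        cout << helper.flow_order_set[i] << endl;
    reader.Close();
}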
position BamAlignmentReader::GetLastPositionInBam(const std::string& bamPath, Region::SharedPtr regionPtr) {
    BamTools::BamReader bamReader;
    if (!bamReader.Open(bamPath)) {
        throw "Unable to open bam file";
    }
    bamReader.LocateIndex();
    int refID = bamReader.GetReferenceID(regionPtr->getReferenceID());
    auto referenceData = bamReader.GetReferenceData();
    bamReader.Close();
    return referenceData[refID].RefLength;
}
void MyBamGroup::ReadGroup(char *bamFile) {
    BamTools::BamReader bamReader;
    if (!bamReader.Open(std::string(bamFile))) {
        errMsg = "Failed to open bam " + std::string(bamFile) + "\n";
    } else {
        BamTools::SamHeader samHeader = bamReader.GetHeader();
        for (BamTools::SamReadGroupIterator itr = samHeader.ReadGroups.Begin(); itr != samHeader.ReadGroups.End(); ++itr) {
            if (itr->HasID()) { ID.push_back(itr->ID); } else { ID.push_back(""); }
            if (itr->HasFlowOrder()) { FlowOrder.push_back(itr->FlowOrder); } else { FlowOrder.push_back(""); }
            if (itr->HasKeySequence()) { KeySequence.push_back(itr->KeySequence); } else { KeySequence.push_back(""); }
            if (itr->HasDescription()) { Description.push_back(itr->Description); } else { Description.push_back(""); }
            if (itr->HasLibrary()) { Library.push_back(itr->Library); } else { Library.push_back(""); }
            if (itr->HasPlatformUnit()) { PlatformUnit.push_back(itr->PlatformUnit); } else { PlatformUnit.push_back(""); }
            if (itr->HasPredictedInsertSize()) { PredictedInsertSize.push_back(itr->PredictedInsertSize); } else { PredictedInsertSize.push_back(""); }
            if (itr->HasProductionDate()) { ProductionDate.push_back(itr->ProductionDate); } else { ProductionDate.push_back(""); }
            if (itr->HasProgram()) { Program.push_back(itr->Program); } else { Program.push_back(""); }
            if (itr->HasSample()) { Sample.push_back(itr->Sample); } else { Sample.push_back(""); }
            if (itr->HasSequencingCenter()) { SequencingCenter.push_back(itr->SequencingCenter); } else { SequencingCenter.push_back(""); }
            if (itr->HasSequencingTechnology()) { SequencingTechnology.push_back(itr->SequencingTechnology); } else { SequencingTechnology.push_back(""); }
        }
        bamReader.Close();
    }
}
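// Hypothetical caller sketch (not from the original sources): the vectors filled by
// MyBamGroup::ReadGroup are parallel, so index i of ID, FlowOrder, KeySequence, ... describes the
// same @RG line. It assumes errMsg is a std::string member of MyBamGroup.
static void exampleMyBamGroupUsage(char *bamFile) {
    MyBamGroup groups;
    groups.ReadGroup(bamFile);
    if (!groups.errMsg.empty()) {
        std::cerr << groups.errMsg;
        return;
    }
    for (size_t i = 0; i < groups.ID.size(); ++i)
        std::cout << groups.ID[i] << "\t" << groups.FlowOrder[i] << "\t" << groups.KeySequence[i] << "\n";
}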
inline int run(Config const& c, TCoverageType covType) {
    // Create library objects
    typedef boost::unordered_map<std::string, LibraryInfo> TLibraryMap;
    typedef boost::unordered_map<std::string, TLibraryMap> TSampleLibrary;
    TSampleLibrary sampleLib;

    // Scan libraries
    for (unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
        // Get a sample name
        std::string sampleName(c.files[file_c].stem().string());

        // Check that all input bam files exist
        BamTools::BamReader reader;
        if (!reader.Open(c.files[file_c].string())) {
            std::cerr << "Could not open input bam file: " << c.files[file_c].string() << std::endl;
            reader.Close();
            return -1;
        }

        // Check that all input bam files are indexed
        reader.LocateIndex();
        if (!reader.HasIndex()) {
            std::cerr << "Missing bam index file: " << c.files[file_c].string() << std::endl;
            reader.Close();
            return -1;
        }

        // Get library parameters and overall maximum insert size
        TLibraryMap libInfo;
        getLibraryParams(c.files[file_c], libInfo, 0, 5);
        sampleLib.insert(std::make_pair(sampleName, libInfo));
    }

    // Get references
    BamTools::BamReader readerRef;
    if (!readerRef.Open(c.files[0].string())) return -1;
    BamTools::RefVector references = readerRef.GetReferenceData();

    // Read all SV intervals
    typedef std::vector<CovRecord> TSVs;
    TSVs svs;
    std::map<unsigned int, std::string> idToName;
    unsigned int intervalCount = 1;
    if (boost::filesystem::exists(c.int_file) && boost::filesystem::is_regular_file(c.int_file) && boost::filesystem::file_size(c.int_file)) {
        typedef boost::unordered_map<std::string, unsigned int> TMapChr;
        TMapChr mapChr;
        typename BamTools::RefVector::const_iterator itRef = references.begin();
        for (unsigned int i = 0; itRef != references.end(); ++itRef, ++i) mapChr[itRef->RefName] = i;
        std::ifstream interval_file(c.int_file.string().c_str(), std::ifstream::in);
        if (interval_file.is_open()) {
            while (interval_file.good()) {
                std::string intervalLine;
                getline(interval_file, intervalLine);
                typedef boost::tokenizer< boost::char_separator<char> > Tokenizer;
                boost::char_separator<char> sep(" \t,;");
                Tokenizer tokens(intervalLine, sep);
                Tokenizer::iterator tokIter = tokens.begin();
                if (tokIter != tokens.end()) {
                    std::string chrName = *tokIter++;
                    TMapChr::const_iterator mapChrIt = mapChr.find(chrName);
                    if (mapChrIt != mapChr.end()) {
                        if (tokIter != tokens.end()) {
                            CovRecord sv;
                            sv.chr = mapChrIt->second;
                            sv.svStart = boost::lexical_cast<int32_t>(*tokIter++);
                            sv.svEnd = boost::lexical_cast<int32_t>(*tokIter++) + 1;
                            std::string svName = *tokIter;
                            idToName.insert(std::make_pair(intervalCount, svName));
                            sv.id = intervalCount++;
                            svs.push_back(sv);
                        }
                    }
                }
            }
            interval_file.close();
        }
    } else {
        // Create artificial intervals
        typename BamTools::RefVector::const_iterator itRef = references.begin();
        for (int refIndex = 0; itRef != references.end(); ++itRef, ++refIndex) {
            int32_t pos = 0;
            unsigned int wSize = c.window_size;
            unsigned int wOffset = c.window_offset;
            if (c.window_num > 0) {
                wSize = (itRef->RefLength / c.window_num) + 1;
                wOffset = wSize;
            }
            while (pos < references[refIndex].RefLength) {
                int32_t window_len = pos + wSize;
                if (window_len > references[refIndex].RefLength) window_len = references[refIndex].RefLength;
                CovRecord sv;
                sv.chr = refIndex;
                sv.svStart = pos;
                sv.svEnd = window_len;
                std::stringstream s;
                s << references[sv.chr].RefName << ":" << sv.svStart << "-" << sv.svEnd;
                idToName.insert(std::make_pair(intervalCount, s.str()));
                sv.id = intervalCount++;
                svs.push_back(sv);
                pos += wOffset;
            }
        }
    }

    // Output data types
    typedef std::pair<std::string, int> TSampleSVPair;
    typedef std::pair<int, int> TBpRead;
    typedef std::map<TSampleSVPair, TBpRead> TCountMap;
    TCountMap countMap;

    // Annotate coverage
    if (c.inclCigar) annotateCoverage(c.files, c.minGenoQual, sampleLib, svs, countMap, BpLevelType<BpLevelCount>(), covType);
    else annotateCoverage(c.files, c.minGenoQual, sampleLib, svs, countMap, BpLevelType<NoBpLevelCount>(), covType);

    // Output library statistics
    std::cout << "Library statistics" << std::endl;
    TSampleLibrary::const_iterator sampleIt = sampleLib.begin();
    for (; sampleIt != sampleLib.end(); ++sampleIt) {
        std::cout << "Sample: " << sampleIt->first << std::endl;
        TLibraryMap::const_iterator libIt = sampleIt->second.begin();
        for (; libIt != sampleIt->second.end(); ++libIt) {
            std::cout << "RG: ID=" << libIt->first << ",Median=" << libIt->second.median
                      << ",MAD=" << libIt->second.mad
                      << ",Orientation=" << (int) libIt->second.defaultOrient << std::endl;
        }
    }

    // Output file
    boost::iostreams::filtering_ostream dataOut;
    dataOut.push(boost::iostreams::gzip_compressor());
    dataOut.push(boost::iostreams::file_sink(c.outfile.string().c_str(), std::ios_base::out | std::ios_base::binary));

    // Print header
    dataOut << "#chr\tstart\tend\tid";
    for (unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
        std::string sampleName(c.files[file_c].stem().string());
        dataOut << "\t";
        if (c.avg_flag) dataOut << sampleName << "_avgcov" << "\t";
        if (c.bp_flag) dataOut << sampleName << "_bpcount" << "\t";
        if ((c.bp_flag) || (c.avg_flag)) dataOut << sampleName << "_readcount";
        else dataOut << sampleName;
    }
    dataOut << std::endl;

    // Iterate all SVs
    typename TSVs::const_iterator itSV = svs.begin();
    typename TSVs::const_iterator itSVEnd = svs.end();
    for (; itSV != itSVEnd; ++itSV) {
        dataOut << references[itSV->chr].RefName << "\t" << itSV->svStart << "\t" << itSV->svEnd << "\t" << idToName.find(itSV->id)->second;
        // Iterate all samples
        for (unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
            // Get the sample name
            std::string sampleName(c.files[file_c].stem().string());
            TSampleSVPair sampleSVPair = std::make_pair(sampleName, itSV->id);
            typename TCountMap::iterator countMapIt = countMap.find(sampleSVPair);
            dataOut << "\t";
            if (c.avg_flag) dataOut << ((countMapIt->second.first) / (double) (itSV->svEnd - itSV->svStart)) << "\t";
            if (c.bp_flag) dataOut << countMapIt->second.first << "\t";
            dataOut << countMapIt->second.second;
        }
        dataOut << std::endl;
    }

    // End
    boost::posix_time::ptime now = boost::posix_time::second_clock::local_time();
    std::cout << '[' << boost::posix_time::to_simple_string(now) << "] Done." << std::endl;
    return 0;
}
//
// Main
//
int filterBAMMain(int argc, char** argv) {
    parseFilterBAMOptions(argc, argv);

    // Read the graph if distance-filtering mode is enabled
    StringGraph* pGraph = NULL;
    if (!opt::asqgFile.empty())
        pGraph = SGUtil::loadASQG(opt::asqgFile, 0, false);

    // Read the BWTs if depth-filtering mode is enabled
    BWT* pBWT = NULL;
    BWT* pRBWT = NULL;
    if (!opt::fmIndexPrefix.empty()) {
        pBWT = new BWT(opt::fmIndexPrefix + BWT_EXT, opt::sampleRate);
        pRBWT = new BWT(opt::fmIndexPrefix + RBWT_EXT, opt::sampleRate);
    }

    Timer* pTimer = new Timer(PROGRAM_IDENT);

    //
    int numPairsTotal = 0;
    int numPairsFilteredByDistance = 0;
    int numPairsFilteredByER = 0;
    int numPairsFilteredByQuality = 0;
    int numPairsFilteredByDepth = 0;
    int numPairsUnmapped = 0;
    int numPairsWrote = 0;

    // Open the bam files for reading/writing
    BamTools::BamReader* pBamReader = new BamTools::BamReader;
    pBamReader->Open(opt::bamFile);

    BamTools::BamWriter* pBamWriter = new BamTools::BamWriter;
    pBamWriter->Open(opt::outFile, pBamReader->GetHeaderText(), pBamReader->GetReferenceData());

    const BamTools::RefVector& referenceVector = pBamReader->GetReferenceData();

    BamTools::BamAlignment record1;
    BamTools::BamAlignment record2;
    bool done = false;
    while (!done) {
        if (numPairsTotal++ % 200000 == 0)
            printf("[sga filterBAM] Processed %d pairs\n", numPairsTotal);

        done = !readAlignmentPair(pBamReader, record1, record2);
        if (done)
            break;

        if (!record1.IsMapped() || !record2.IsMapped()) {
            numPairsUnmapped += 1;
            continue;
        }

        // Ensure the pairing is correct
        if (record1.Name != record2.Name)
            std::cout << "NAME FAIL: " << record1.Name << " " << record2.Name << "\n";
        assert(record1.Name == record2.Name);

        bool bPassedFilters = true;

        // Check if the error rate is below the max
        double er1 = getErrorRate(record1);
        double er2 = getErrorRate(record2);
        if (er1 > opt::maxError || er2 > opt::maxError) {
            bPassedFilters = false;
            numPairsFilteredByER += 1;
        }

        if (record1.MapQuality < opt::minQuality || record2.MapQuality < opt::minQuality) {
            bPassedFilters = false;
            numPairsFilteredByQuality += 1;
        }

        // Perform depth check for pairs aligning to different contigs
        if (bPassedFilters && (pBWT != NULL && pRBWT != NULL && opt::maxKmerDepth > 0) && (record1.RefID != record2.RefID)) {
            int maxDepth1 = getMaxKmerDepth(record1.QueryBases, pBWT, pRBWT);
            int maxDepth2 = getMaxKmerDepth(record2.QueryBases, pBWT, pRBWT); // was record1.QueryBases; use the second read of the pair
            if (maxDepth1 > opt::maxKmerDepth || maxDepth2 > opt::maxKmerDepth) {
                bPassedFilters = false;
                numPairsFilteredByDepth += 1;
            }
        }

        // Perform short-insert pair check
        if (pGraph != NULL) {
            bPassedFilters = bPassedFilters && filterByGraph(pGraph, referenceVector, record1, record2);
            numPairsFilteredByDistance += 1;
        }

        if (bPassedFilters) {
            pBamWriter->SaveAlignment(record1);
            pBamWriter->SaveAlignment(record2);
            numPairsWrote += 1;
        }
    }

    std::cout << "Total pairs: " << numPairsTotal << "\n";
    std::cout << "Total pairs output: " << numPairsWrote << "\n";
    std::cout << "Total filtered because one pair is unmapped: " << numPairsUnmapped << "\n";
    std::cout << "Total filtered by distance: " << numPairsFilteredByDistance << "\n";
    std::cout << "Total filtered by error rate: " << numPairsFilteredByER << "\n";
    std::cout << "Total filtered by quality: " << numPairsFilteredByQuality << "\n";
    std::cout << "Total filtered by depth: " << numPairsFilteredByDepth << "\n";

    if (pGraph != NULL)
        delete pGraph;
    if (pBWT != NULL)
        delete pBWT;
    if (pRBWT != NULL)
        delete pRBWT;

    pBamWriter->Close();
    pBamReader->Close();

    delete pTimer;
    delete pBamReader;
    delete pBamWriter;
    return 0;
}
void Config::InitializationClustering() {
    struct stat st;
    // Note: S_ISDIR(st.st_mode) tests for a directory; the original "st.st_mode and S_IFDIR != 0"
    // only checked that the path exists.
    if (stat(Workspace.c_str(), &st) == 0 and S_ISDIR(st.st_mode))
        Log("[Warning] Workspace directory already present");
    else if (mkdir(Workspace.c_str(), 0755) != 0) {
        Log("[Error] Could not create workspace directory: " + Workspace);
        exit(1);
    }

    RunningTasksFile = Workspace + "/" + FilePrefix + "running.tasks";
    StatsFile = Workspace + "/" + FilePrefix + "stats";
    BinClusterFile = Workspace + "/" + FilePrefix + "bpc";
    clusterFile = new ClusterFile(BinClusterFile);

    clusterDir = Workspace + "/clusters/";
    if (stat(clusterDir.c_str(), &st) == 0 and S_ISDIR(st.st_mode))
        Log("[Warning] Cluster directory already present");
    else if (mkdir(clusterDir.c_str(), 0755) != 0) {
        Log("[Error] Could not create cluster directory: " + clusterDir);
        exit(1);
    }

    insertsizeDir = Workspace + "/insertsize/";
    if (stat(insertsizeDir.c_str(), &st) == 0 and S_ISDIR(st.st_mode))
        Log("[Warning] Insertsize directory already present");
    else if (mkdir(insertsizeDir.c_str(), 0755) != 0) {
        Log("[Error] Could not create insertsize directory: " + insertsizeDir);
        exit(1);
    }

    coverageDir = Workspace + "/coverage/";
    if (stat(coverageDir.c_str(), &st) == 0 and S_ISDIR(st.st_mode))
        Log("[Warning] Coverage directory already present");
    else if (mkdir(coverageDir.c_str(), 0755) != 0) {
        Log("[Error] Could not create coverage directory: " + coverageDir);
        exit(1);
    }

    if (!ForwardBam.empty() && !ReverseBam.empty() && PairedBam.empty()) {
        UsePairedBam = false;
    } else if (ForwardBam.empty() && ReverseBam.empty() && !PairedBam.empty()) {
        UsePairedBam = true;
    } else {
        Log("[Error] No correct bam file(s)");
        exit(1);
    }

    BamTools::BamAlignment alignment;
    BamTools::BamReader BamReader;
    if (UsePairedBam) {
        BamReader.Open(PairedBam);
        if (not BamReader.IsOpen()) {
            Log("[Error] Could not open paired bam");
            exit(1);
        }
        if (PairedIndex.empty()) {
            if (not BamReader.LocateIndex(BamTools::BamIndex::STANDARD)) {
                // Replace the ".bam" extension with ".bai"
                PairedIndex = PairedBam.substr(0, PairedBam.size() - 4) + ".bai";
                BamReader.OpenIndex(PairedIndex);
            }
            if (not BamReader.HasIndex()) {
                Log("[Error] No index for bamfile");
                exit(1);
            }
        }
        BamTools::SamHeader header = BamReader.GetHeader();
        for (BamTools::SamReadGroupIterator it = header.ReadGroups.Begin(); it != header.ReadGroups.End(); it++) {
            BamTools::SamReadGroup* readgroup = &*it;
            readNameConverter.TrimName(readgroup->ID);
            readNameConverter.AddReadGroup(readgroup->ID);
        }
        long int count = 0;
        while (BamReader.GetNextAlignment(alignment)) {
            string RG;
            if (alignment.GetTag("RG", RG)) {
                if (not NameTrim.empty()) readNameConverter.TrimName(RG);
                if (readNameConverter.AddReadGroup(RG)) {
                    Log("[Warning] Readgroup '" + RG + "' found in reads but not in header");
                    count = 0;
                }
            }
            count++;
            if (count > 10000) break;
        }
        BamReader.Close();
    } else {
        BamReader.Open(ForwardBam);
        if (not BamReader.IsOpen()) {
            Log("[Error] Could not open first/forward bam");
            exit(1);
        }
        if (ForwardIndex.empty()) {
            if (not BamReader.LocateIndex(BamTools::BamIndex::STANDARD)) {
                ForwardIndex = ForwardBam.substr(0, ForwardBam.size() - 4) + ".bai";
                BamReader.OpenIndex(ForwardIndex);
            }
            if (not BamReader.HasIndex()) {
                Log("[Error] No index for forward bamfile");
                exit(1);
            }
        }
        BamTools::SamHeader forwardheader = BamReader.GetHeader();
        for (BamTools::SamReadGroupIterator it = forwardheader.ReadGroups.Begin(); it != forwardheader.ReadGroups.End(); it++) {
            BamTools::SamReadGroup* readgroup = &*it;
            readNameConverter.TrimName(readgroup->ID);
            readNameConverter.AddReadGroup(readgroup->ID);
        }
        long int count = 0;
        while (BamReader.GetNextAlignment(alignment)) {
            string RG;
            if (alignment.GetTag("RG", RG)) {
                if (!NameTrim.empty()) readNameConverter.TrimName(RG);
                if (readNameConverter.AddReadGroup(RG)) {
                    Log("[Warning] Readgroup '" + RG + "' found in forward reads but not in header");
                    count = 0;
                }
            }
            count++;
            if (count > 10000) break;
        }
        BamReader.Close();

        BamReader.Open(ReverseBam);
        if (not BamReader.IsOpen()) {
            Log("[Error] Could not open second/reverse bam");
            exit(1);
        }
        if (ReverseIndex.empty()) {
            if (not BamReader.LocateIndex(BamTools::BamIndex::STANDARD)) {
                ReverseIndex = ReverseBam.substr(0, ReverseBam.size() - 4) + ".bai";
                BamReader.OpenIndex(ReverseIndex);
            }
            if (not BamReader.HasIndex()) {
                Log("[Error] No index for reverse bamfile");
                exit(1);
            }
        }
        BamTools::SamHeader reverseheader = BamReader.GetHeader();
        for (BamTools::SamReadGroupIterator it = reverseheader.ReadGroups.Begin(); it != reverseheader.ReadGroups.End(); it++) {
            BamTools::SamReadGroup* readgroup = &*it;
            readNameConverter.TrimName(readgroup->ID);
            if (readNameConverter.AddReadGroup(readgroup->ID)) {
                Log("[Warning] Readgroup '" + readgroup->ID + "' found in reverse but not in forward");
            }
        }
        count = 0;
        while (BamReader.GetNextAlignment(alignment)) {
            string RG;
            if (alignment.GetTag("RG", RG)) {
                if (!NameTrim.empty()) readNameConverter.TrimName(RG);
                if (readNameConverter.AddReadGroup(RG)) {
                    Log("[Warning] Readgroup '" + RG + "' found in reverse reads but not in header");
                    count = 0;
                }
            }
            count++;
            if (count > 10000) break;
        }
        BamReader.Close();
    }

    for (map<string, int>::iterator it = readNameConverter.ReadGroups.begin(); it != readNameConverter.ReadGroups.end(); ++it) {
        ostringstream logBuffer;
        logBuffer << "Readgroup found: " << it->second << " - " << it->first;
        Log(logBuffer.str());
    }
    writeConfigFile(Workspace + "/" + FilePrefix + "config"); // separator added to match the other workspace paths
}
//
// Main
//
int somaticVariantFiltersMain(int argc, char** argv) {
    parseSomaticVariantFiltersOptions(argc, argv);

    Timer* pTimer = new Timer(PROGRAM_IDENT);

    // Load Reference
    ReadTable refTable(opt::referenceFile, SRF_NO_VALIDATION);
    refTable.indexReadsByID();

    // Load BAMs
    BamTools::BamReader* pTumorBamReader = new BamTools::BamReader;
    pTumorBamReader->Open(opt::tumorBamFile);
    pTumorBamReader->LocateIndex();
    assert(pTumorBamReader->HasIndex());

    BamTools::BamReader* pNormalBamReader = new BamTools::BamReader;
    pNormalBamReader->Open(opt::normalBamFile);
    pNormalBamReader->LocateIndex();
    assert(pNormalBamReader->HasIndex());

    // Track duplicated variants
    HashSet<std::string> duplicateHash;

    std::ifstream input(opt::vcfFile.c_str());
    std::string line;

    while (getline(input, line)) {
        if (line.empty())
            continue;

        if (line[0] == '#') {
            std::cout << line << "\n";
            continue;
        }

        // parse record
        VCFRecord record(line);
        if (record.isMultiAllelic()) {
            std::cerr << "Error: multi-allelic VCF found, please run vcfbreakmulti\n";
            exit(EXIT_FAILURE);
        }

        // Check if we've seen this variant already
        std::string key = makeVariantKey(record);
        if (duplicateHash.find(key) != duplicateHash.end())
            continue;
        else
            duplicateHash.insert(key);

        if (opt::verbose > 0) {
            std::stringstream ss;
            ss << "Variant: " << record << "\n";
            fprintf(stderr, "===============================================\n%s", ss.str().c_str());
        }

        StringStringHash tagHash;
        makeTagHash(record, tagHash);

        StringVector fail_reasons;

        int hplen = 0;
        if (!getTagValue(tagHash, "HPLen", hplen))
            hplen = calculateHomopolymerLength(record, &refTable);
        if (hplen > opt::maxHPLen)
            fail_reasons.push_back("Homopolymer");

        double dust = 0.0f;
        if (!getTagValue(tagHash, "Dust", dust))
            dust = HapgenUtil::calculateDustScoreAtPosition(record.refName, record.refPosition, &refTable);
        if (dust > opt::maxDust)
            fail_reasons.push_back("LowComplexity");

        double af;
        if (getTagValue(tagHash, "AF", af) && af < opt::minAF)
            fail_reasons.push_back("LowAlleleFrequency");

        int varDP;
        if (getTagValue(tagHash, "VarDP", varDP) && varDP < opt::minVarDP)
            fail_reasons.push_back("LowVarDP");

        double strandBias;
        if (getTagValue(tagHash, "SB", strandBias) && strandBias >= opt::maxStrandBias)
            fail_reasons.push_back("StrandBias");

        CoverageStats tumor_stats = getVariantCoverage(pTumorBamReader, record, &refTable);
        CoverageStats normal_stats = getVariantCoverage(pNormalBamReader, record, &refTable);

        if (opt::verbose > 0) {
            fprintf(stderr, "Tumor: [%zu %zu]\n", tumor_stats.n_total_reads, tumor_stats.n_evidence_reads);
            fprintf(stderr, "Normal: [%zu %zu]\n", normal_stats.n_total_reads, normal_stats.n_evidence_reads);
        }

        if (normal_stats.n_evidence_reads > opt::maxNormalReads)
            fail_reasons.push_back("NormalEvidence");

        if (normal_stats.n_total_reads < opt::minNormalDepth)
            fail_reasons.push_back("LowNormalDepth");

        if (!tumor_stats.snv_evidence_quals.empty()) {
            double median_quality = median(tumor_stats.snv_evidence_quals);
            if (median_quality < opt::minMedianQuality)
                fail_reasons.push_back("LowQuality");
        }

        if (tumor_stats.median_mapping_quality < opt::minMedianQuality)
            fail_reasons.push_back("LowMappingQuality");

        if (!fail_reasons.empty()) {
            if (record.passStr != "PASS" && record.passStr != ".")
                fail_reasons.insert(fail_reasons.begin(), record.passStr);

            std::stringstream strss;
            std::copy(fail_reasons.begin(), fail_reasons.end(), std::ostream_iterator<std::string>(strss, ";"));
            record.passStr = strss.str();
            record.passStr.erase(record.passStr.size() - 1); // erase trailing ;
        }

        std::cout << record << "\n";
    }

    // Cleanup
    delete pTumorBamReader;
    delete pNormalBamReader;
    delete pTimer;

    return 0;
}
int scanBam() {
    std::vector< std::map<std::string, ScanResults> > resultlist;
    // std::ostream* pWriter;
    // pWriter = &std::cout;
    bool isExome = opt::exomebedfile.size() == 0 ? false : true;

    std::cout << opt::bamlist << "\n";
    std::cout << opt::bamlist.size() << " BAMs" << std::endl;

    for (std::size_t i = 0; i < opt::bamlist.size(); i++) {
        // storing results for each read group (RG tag). use
        // read group ID as key.
        std::map<std::string, ScanResults> resultmap;

        // store where the overlap was last found in the case of exome seq
        std::map<std::string, std::vector<range>::iterator> lastfound;
        std::vector<range>::iterator searchhint;

        std::cerr << "Start analysing BAM " << opt::bamlist[i] << "\n";

        // Open the bam files for reading/writing
        BamTools::BamReader* pBamReader = new BamTools::BamReader;
        pBamReader->Open(opt::bamlist[i]);

        // get bam headers
        const BamTools::SamHeader header = pBamReader->GetHeader();
        // for(BamTools::SamSequenceConstIterator it = header.Sequences.Begin(); it != header.Sequences.End(); ++it){
        //     std::cout << "Assembly ID:" << it->AssemblyID << ", Name:" << it->Name << std::endl;
        // }
        // exit(0);

        bool rggroups = false;
        if (opt::ignorerg) {
            // ignore read groups
            std::cerr << "Treat all reads in BAM as if they were from a same sample" << std::endl;
            ScanResults results;
            results.sample = opt::unknown;
            resultmap[opt::unknown] = results;
        } else {
            std::map<std::string, std::string> readgroups;
            std::map<std::string, std::string> readlibs;
            rggroups = header.HasReadGroups();
            if (rggroups) {
                for (BamTools::SamReadGroupConstIterator it = header.ReadGroups.Begin(); it != header.ReadGroups.End(); ++it) {
                    readgroups[it->ID] = it->Sample;
                    if (it->HasLibrary()) {
                        readlibs[it->ID] = it->Library;
                    } else {
                        readlibs[it->ID] = opt::unknown;
                    }
                }
                std::cerr << "Specified BAM has " << readgroups.size() << " read groups" << std::endl;
                for (std::map<std::string, std::string>::iterator it = readgroups.begin(); it != readgroups.end(); ++it) {
                    ScanResults results;
                    std::string rgid = it->first;
                    results.sample = it->second;
                    results.lib = readlibs[rgid];
                    resultmap[rgid] = results; // results are identified by RG tag.
                }
            } else {
                std::cerr << "Warning: can't find RG tag in the BAM header" << std::endl;
                std::cerr << "Warning: treat all reads in BAM as if they were from a same sample" << std::endl;
                ScanResults results;
                results.sample = opt::unknown;
                results.lib = opt::unknown;
                resultmap[opt::unknown] = results;
            }
        }

        BamTools::BamAlignment record1;
        bool done = false;
        int nprocessed = 0; // number of reads analyzed
        int ntotal = 0;     // number of reads scanned in bam (we skip some reads, see below)
        while (!done) {
            ntotal++;
            done = !pBamReader->GetNextAlignment(record1);
            if (done) break; // stop before processing a stale record once the BAM is exhausted

            std::string tag = opt::unknown;
            if (rggroups) {
                // skip reads that do not have read group tag
                if (record1.HasTag("RG")) {
                    record1.GetTag("RG", tag);
                    // std::cerr << c << " reads:{" << record1.QueryBases << "} tag:{" << tag << "}\n";
                } else {
                    std::cerr << "can't find RG tag for read at position {" << record1.RefID << ":" << record1.Position << "}" << std::endl;
                    std::cerr << "skip this read" << std::endl;
                    continue;
                }
            }

            // skip reads with readgroup not defined in BAM header
            if (resultmap.find(tag) == resultmap.end()) {
                std::cerr << "RG tag {" << tag << "} for read at position ";
                std::cerr << "{" << record1.RefID << ":" << record1.Position << "} doesn't exist in BAM header.";
                continue;
            }

            // for exome, exclude reads mapped to the exome regions.
            if (isExome) {
                range rg;
                rg.first = record1.Position;
                rg.second = record1.Position + record1.Length;
                std::string chrm = refID2Name(record1.RefID);
                if (chrm != "-1") {
                    // check if overlap exome when the read is mapped to chr1-22, X, Y
                    // std::cerr << "read: " << chrm << " " << rg << "\n" << std::endl;
                    std::map<std::string, std::vector<range> >::iterator chrmit = opt::exomebed.find(chrm);
                    if (chrmit == opt::exomebed.end()) {
                        // std::cerr << "chromosome or reference sequence: " << chrm << " is not present in the specified exome bed file." << std::endl;
                        // std::cerr << "please check sequence name encoding, i.e. for chromosome one, is it chr1 or 1" << std::endl;
                        // unmapped reads can have chr names as a star (*). We also don't consider MT reads.
                        resultmap[tag].n_exreadsChrUnmatched += 1;
                    } else {
                        std::vector<range>::iterator itend = opt::exomebed[chrm].end();
                        std::map<std::string, std::vector<range>::iterator>::iterator lastfoundchrmit = lastfound.find(chrm);
                        if (lastfoundchrmit == lastfound.end()) {
                            // first entry to this chrm
                            lastfound[chrm] = chrmit->second.begin(); // start from beginning
                        }
                        // set the hint to where the previous found is
                        searchhint = lastfound[chrm];
                        std::vector<range>::iterator itsearch = searchRange(searchhint, itend, rg);
                        if (itsearch != itend) { // if found
                            searchhint = itsearch;
                            resultmap[tag].n_exreadsExcluded += 1;
                            lastfound[chrm] = searchhint; // update search hint
                            continue;
                        }
                    }
                }
            }

            resultmap[tag].numTotal += 1;
            if (record1.IsMapped()) {
                resultmap[tag].numMapped += 1;
            }
            if (record1.IsDuplicate()) {
                resultmap[tag].numDuplicates += 1;
            }

            double gc = calcGC(record1.QueryBases);
            int ptn_count = countMotif(record1.QueryBases, opt::PATTERN, opt::PATTERN_REV);
            // when the read length exceeds 100bp, number of patterns might exceed the boundary
            if (ptn_count > ScanParameters::TEL_MOTIF_N - 1) {
                continue;
            }
            resultmap[tag].telcounts[ptn_count] += 1;

            if (gc >= ScanParameters::GC_LOWERBOUND && gc <= ScanParameters::GC_UPPERBOUND) {
                // get index for GC bin.
                int idx = floor((gc - ScanParameters::GC_LOWERBOUND) / ScanParameters::GC_BINSIZE);
                assert(idx >= 0 && idx <= ScanParameters::GC_BIN_N - 1);
                // std::cerr << c << " GC:{" << gc << "} telcounts:{" << ptn_count << "} GC idx{" << idx << "}\n";
                if (idx > ScanParameters::GC_BIN_N - 1) {
                    std::cerr << nprocessed << " GC:{" << gc << "} telcounts:{" << ptn_count << "} GC bin index out of bound:" << idx << "\n";
                    exit(EXIT_FAILURE);
                }
                resultmap[tag].gccounts[idx] += 1;
            }

            // if(resultmap[tag].n_exreadsChrUnmatched > 1000){
            //     std::cerr << "too many reads found with unmatched chromosome ID between BAM and exome BED. \n" << std::endl;
            // }

            nprocessed++;
            if (nprocessed % 10000000 == 0) {
                std::cerr << "[scan] processed " << nprocessed << " reads \n";
            }
        }
        pBamReader->Close();
        delete pBamReader;

        // consider each BAM separately
        resultlist.push_back(resultmap);
        std::cerr << "[scan] total reads in BAM scanned " << ntotal << std::endl;
        std::cerr << "Completed scanning BAM\n";
    }

    if (opt::onebam) {
        merge_results_by_readgroup(resultlist);
    }
    outputresults(resultlist);
    if (isExome) {
        printlog(resultlist);
    }
    std::cerr << "Completed writing results\n";
    return 0;
}
inline int run(Config const& c, TSingleHit) {
    // Create library objects
    typedef std::map<std::string, LibraryInfo> TLibraryMap;
    typedef std::map<std::string, TLibraryMap> TSampleLibrary;
    TSampleLibrary sampleLib;

    // Scan libraries
    for (unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
        // Get a sample name
        std::string sampleName(c.files[file_c].stem().string());

        // Check that all input bam files exist
        BamTools::BamReader reader;
        if (!reader.Open(c.files[file_c].string())) {
            std::cerr << "Could not open input bam file: " << c.files[file_c].string() << std::endl;
            reader.Close();
            return -1;
        }

        // Check that all input bam files are indexed
        reader.LocateIndex();
        if (!reader.HasIndex()) {
            std::cerr << "Missing bam index file: " << c.files[file_c].string() << std::endl;
            reader.Close();
            return -1;
        }

        // Get library parameters and overall maximum insert size
        TLibraryMap libInfo;
        getLibraryParams(c.files[file_c], libInfo, 0, 5);
        sampleLib.insert(std::make_pair(sampleName, libInfo));
    }

    // Read all SV intervals
    typedef std::vector<StructuralVariantRecord> TSVs;
    TSVs svs;
    std::map<unsigned int, std::string> idToName;
    unsigned int intervalCount = 1;
    if (boost::filesystem::exists(c.int_file) && boost::filesystem::is_regular_file(c.int_file) && boost::filesystem::file_size(c.int_file)) {
        Memory_mapped_file interval_file(c.int_file.string().c_str());
        char interval_buffer[Memory_mapped_file::MAX_LINE_LENGTH];
        while (interval_file.left_bytes() > 0) {
            interval_file.read_line(interval_buffer);

            // Read single interval line
            StructuralVariantRecord sv;
            Tokenizer token(interval_buffer, Memory_mapped_file::MAX_LINE_LENGTH);
            std::string interval_rname;
            token.getString(sv.chr);
            sv.svStart = token.getUInt();
            sv.svEnd = token.getUInt() + 1;
            std::string svName;
            token.getString(svName);
            idToName.insert(std::make_pair(intervalCount, svName));
            sv.id = intervalCount++;
            svs.push_back(sv);
        }
        interval_file.close();
    } else {
        // Create artificial intervals
        BamTools::BamReader readerRef;
        if (!readerRef.Open(c.files[0].string())) return -1;
        BamTools::RefVector references = readerRef.GetReferenceData();
        typename BamTools::RefVector::const_iterator itRef = references.begin();
        for (int refIndex = 0; itRef != references.end(); ++itRef, ++refIndex) {
            int32_t pos = 0;
            while (pos < references[refIndex].RefLength) {
                int32_t window_len = pos + c.window_size;
                if (window_len > references[refIndex].RefLength) window_len = references[refIndex].RefLength;
                StructuralVariantRecord sv;
                sv.chr = references[refIndex].RefName;
                sv.svStart = pos;
                sv.svEnd = window_len;
                std::stringstream s;
                s << sv.chr << ":" << sv.svStart << "-" << sv.svEnd;
                idToName.insert(std::make_pair(intervalCount, s.str()));
                sv.id = intervalCount++;
                svs.push_back(sv);
                pos += c.window_offset;
            }
        }
    }

    // Output data types
    typedef std::pair<std::string, int> TSampleSVPair;
    typedef std::pair<int, int> TBpRead;
    typedef std::map<TSampleSVPair, TBpRead> TCountMap;
    TCountMap countMap;

    // Annotate coverage
    annotateCoverage(c.files, c.minMapQual, c.inclCigar, sampleLib, svs, countMap, TSingleHit());

    // Output library statistics
    std::cout << "Library statistics" << std::endl;
    TSampleLibrary::const_iterator sampleIt = sampleLib.begin();
    for (; sampleIt != sampleLib.end(); ++sampleIt) {
        std::cout << "Sample: " << sampleIt->first << std::endl;
        TLibraryMap::const_iterator libIt = sampleIt->second.begin();
        for (; libIt != sampleIt->second.end(); ++libIt) {
            std::cout << "RG: ID=" << libIt->first << ",Median=" << libIt->second.median
                      << ",MAD=" << libIt->second.mad
                      << ",Orientation=" << (int) libIt->second.defaultOrient
                      << ",MappedReads=" << libIt->second.mappedReads
                      << ",DuplicatePairs=" << libIt->second.non_unique_pairs
                      << ",UniquePairs=" << libIt->second.unique_pairs << std::endl;
        }
    }

    // Output file
    boost::iostreams::filtering_ostream dataOut;
    dataOut.push(boost::iostreams::gzip_compressor());
    dataOut.push(boost::iostreams::file_sink(c.outfile.string().c_str(), std::ios_base::out | std::ios_base::binary));

    // Iterate all SVs
    typename TSVs::const_iterator itSV = svs.begin();
    typename TSVs::const_iterator itSVEnd = svs.end();
    for (; itSV != itSVEnd; ++itSV) {
        dataOut << itSV->chr << "\t" << itSV->svStart << "\t" << itSV->svEnd << "\t" << idToName.find(itSV->id)->second;
        // Iterate all samples
        for (unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
            // Get the sample name
            std::string sampleName(c.files[file_c].stem().string());
            TSampleSVPair sampleSVPair = std::make_pair(sampleName, itSV->id);
            typename TCountMap::iterator countMapIt = countMap.find(sampleSVPair);
            dataOut << "\t";
            if (c.avg_flag) dataOut << ((countMapIt->second.first) / (double) (itSV->svEnd - itSV->svStart)) << "\t";
            if (c.bp_flag) dataOut << countMapIt->second.first << "\t";
            dataOut << countMapIt->second.second;
        }
        dataOut << std::endl;
    }

    // End
    boost::posix_time::ptime now = boost::posix_time::second_clock::local_time();
    std::cout << '[' << boost::posix_time::to_simple_string(now) << "] Done." << std::endl;
    return 0;
}