// ValidateReaders checks that all the readers point to BAM files representing // alignments against the same set of reference sequences, and that the // sequences are identically ordered. If these checks fail the operation of // the multireader is undefined, so we force program exit. void BamMultiReader::ValidateReaders(void) const { int firstRefCount = readers.front().first->GetReferenceCount(); BamTools::RefVector firstRefData = readers.front().first->GetReferenceData(); for (vector<pair<BamReader*, BamAlignment*> >::const_iterator it = readers.begin(); it != readers.end(); ++it) { BamReader* reader = it->first; BamTools::RefVector currentRefData = reader->GetReferenceData(); BamTools::RefVector::const_iterator f = firstRefData.begin(); BamTools::RefVector::const_iterator c = currentRefData.begin(); if (reader->GetReferenceCount() != firstRefCount || firstRefData.size() != currentRefData.size()) { cerr << "ERROR: mismatched number of references in " << reader->GetFilename() << " expected " << firstRefCount << " reference sequences but only found " << reader->GetReferenceCount() << endl; exit(1); } // this will be ok; we just checked above that we have identically-sized sets of references // here we simply check if they are all, in fact, equal in content while (f != firstRefData.end()) { if (f->RefName != c->RefName || f->RefLength != c->RefLength) { cerr << "ERROR: mismatched references found in " << reader->GetFilename() << " expected: " << endl; for (BamTools::RefVector::const_iterator a = firstRefData.begin(); a != firstRefData.end(); ++a) cerr << a->RefName << " " << a->RefLength << endl; cerr << "but found: " << endl; for (BamTools::RefVector::const_iterator a = currentRefData.begin(); a != currentRefData.end(); ++a) cerr << a->RefName << " " << a->RefLength << endl; exit(1); } ++f; ++c; } } }
// Builds the chromosome lookup table from a BAM reference list.
// Every reference name is mapped to a (length, id) pair; ids are handed out
// as 0, 1, 2, ... in the order the references appear in refVector.
NewGenomeFile::NewGenomeFile(const BamTools::RefVector &refVector)
: _maxId(-1)
{
    for (BamTools::RefVector::const_iterator ref = refVector.begin(); ref != refVector.end(); ++ref) {
        QuickString name = ref->RefName;
        CHRPOS size = ref->RefLength;

        // _maxId always holds the id that was handed out most recently
        ++_maxId;
        _chromSizeIds[name] = pair<CHRPOS, CHRPOS>(size, _maxId);
    }
}
// Builds the chromosome lookup table and the ordered chromosome list from a
// BAM reference list. Every reference name is mapped to a (length, id) pair;
// ids are handed out as 0, 1, 2, ... in the order the references appear.
// One extra entry with an empty name and length 0 is appended at the end.
NewGenomeFile::NewGenomeFile(const BamTools::RefVector &refVector)
: _maxId(-1)
{
    for (BamTools::RefVector::const_iterator ref = refVector.begin(); ref != refVector.end(); ++ref) {
        string name = ref->RefName;
        CHRPOS size = ref->RefLength;

        // _maxId always holds the id that was handed out most recently
        ++_maxId;
        _chromSizeIds[name] = pair<CHRPOS, CHRPOS>(size, _maxId);
        _chromList.push_back(name);
    }

    // Special: BAM files can have unmapped reads, which show as no chromosome,
    // or an empty chrom string. Register an empty chrom so these don't error.
    ++_maxId;
    _chromSizeIds[""] = pair<CHRPOS, int>(0, _maxId);
    _chromList.push_back("");
}
// Sets up one TargetContig per reference sequence and records, for each
// reference name, its position in the contig list.
// With an empty reference list no contig array is allocated and
// m_contigList stays NULL.
RegionCoverage::RegionCoverage( const BamTools::RefVector& references )
    : m_contigList(NULL)
    , m_bcovRegion(NULL)
    , m_rcovRegion(NULL)
    , m_lastRegionAssigned(NULL)
    , m_numAuxFields(0)
    , m_ncovDepths(0)
{
    m_numRefContigs = references.size();

    if( m_numRefContigs != 0 ) {
        m_contigList = new TargetContig*[m_numRefContigs];
        for( size_t contig = 0; contig < m_numRefContigs; ++contig ) {
            m_contigList[contig] = new TargetContig( references[contig].RefName, references[contig].RefLength );
            // name -> index lookup for this contig
            m_contigIdx[ references[contig].RefName ] = contig;
        }
    }

    // initial values force set up on first call to 'iterator':
    // both cursors start one past the last valid contig index
    m_rcovContigIdx = m_numRefContigs;
    m_bcovContigIdx = m_rcovContigIdx;
    m_bcovRegionPos = 0;
}
int bamToLociMeth(std::string bamFile1, std::string bamFile2, const char * outFile, std::string sample, int d, int freq, int methdiff) { std::map <std::string, int> allpatterns; std::map <std::string, int> patternMeth; getAllPatterns(allpatterns, patternMeth); BamTools::BamReader reader1; BamTools::BamReader reader2; bamCheck(bamFile1, reader1); bamCheck(bamFile2, reader2); // count reads number of bam files for normalization. /*int n1=0; int n2=0; BamTools::BamAlignment al1; BamTools::BamAlignment al2; while(reader1.GetNextAlignment(al1)){ n1=n1+1; } while(reader2.GetNextAlignment(al2)){ n2=n2+1; } std::cout << bamFile1 << " read count: " << n1 << ".\n" << bamFile2 << " read count: " << n2 << "." << std::endl; double fnorm1k2=double(n1)/double(n2); */ // get reference name from first bam const BamTools::RefVector refs = reader1.GetReferenceData(); reader1.LocateIndex(); reader2.LocateIndex(); if (reader1.HasIndex() & reader2.HasIndex()) { std::cerr << "Output file: " << outFile << std::endl; std::cerr << "===============================" << std::endl; ogzstream myfile; myfile.open (outFile); header(myfile, allpatterns); for(BamTools::RefVector::const_iterator i = refs.begin(); i != refs.end(); ++i) { std::map<std::string, std::vector<std::string> > lociMeth1; std::map<std::string, std::vector<std::string> > lociMeth2; // iterate by chromosome for bamFile1 and bamFile2 readerToMeth(reader1, reader2, lociMeth1, lociMeth2, i, d, refs, sample); //entropy analysis interLoci(lociMeth1, lociMeth2, allpatterns, patternMeth, freq, methdiff, myfile); } reader1.Close(); reader2.Close(); myfile.close(); finished(); return 0; } else { std::cerr << "Could not load index data for all input BAM file(s)... Aborting." << std::endl; return false; } }
// Computes per-interval coverage for every input BAM file and writes a
// gzip-compressed, tab-separated table to c.outfile.
//
// Steps:
//   1. Open every file in c.files, require an index, and collect its library
//      parameters via getLibraryParams().
//   2. Build the interval list: read it from c.int_file when that is a
//      non-empty regular file, otherwise tile every reference sequence of the
//      first BAM file with windows of c.window_size, stepping c.window_offset.
//   3. Call annotateCoverage() to fill the per-(sample, interval) counts.
//   4. Print library statistics to stdout and the table to c.outfile.
//
// Returns 0 on success and -1 when a BAM file cannot be opened, lacks an
// index, or the artificial windows cannot be built (no input files, or a
// window offset of zero, which would never advance).
inline int run(Config const& c, TSingleHit) {
  // Create library objects
  typedef std::map<std::string, LibraryInfo> TLibraryMap;
  typedef std::map<std::string, TLibraryMap> TSampleLibrary;
  TSampleLibrary sampleLib;

  // Scan libraries
  for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
    // Get a sample name (file name without extension)
    std::string sampleName(c.files[file_c].stem().string());

    // Check that all input bam files exist
    BamTools::BamReader reader;
    if ( ! reader.Open(c.files[file_c].string()) ) {
      std::cerr << "Could not open input bam file: " << c.files[file_c].string() << std::endl;
      reader.Close();
      return -1;
    }

    // Check that all input bam files are indexed
    reader.LocateIndex();
    if ( !reader.HasIndex() ) {
      std::cerr << "Missing bam index file: " << c.files[file_c].string() << std::endl;
      reader.Close();
      return -1;
    }

    // Get library parameters and overall maximum insert size
    TLibraryMap libInfo;
    getLibraryParams(c.files[file_c], libInfo, 0, 5);
    sampleLib.insert(std::make_pair(sampleName, libInfo));
  }

  // Read all SV intervals
  typedef std::vector<StructuralVariantRecord> TSVs;
  TSVs svs;
  std::map<unsigned int, std::string> idToName;
  unsigned int intervalCount=1;
  if (boost::filesystem::exists(c.int_file) && boost::filesystem::is_regular_file(c.int_file) && boost::filesystem::file_size(c.int_file)) {
    Memory_mapped_file interval_file(c.int_file.string().c_str());
    char interval_buffer[Memory_mapped_file::MAX_LINE_LENGTH];
    while (interval_file.left_bytes() > 0) {
      interval_file.read_line(interval_buffer);

      // Read single interval line: chr, start, end, name
      StructuralVariantRecord sv;
      Tokenizer token(interval_buffer, Memory_mapped_file::MAX_LINE_LENGTH);
      token.getString(sv.chr);
      sv.svStart = token.getUInt();
      sv.svEnd = token.getUInt() + 1;
      std::string svName;
      token.getString(svName);
      idToName.insert(std::make_pair(intervalCount, svName));
      sv.id = intervalCount++;
      svs.push_back(sv);
    }
    interval_file.close();
  } else {
    // Create artificial intervals

    // c.files[0] is read below, so there must be at least one input file
    if (c.files.size() == 0) {
      std::cerr << "No input bam files specified." << std::endl;
      return -1;
    }
    // a zero offset would never advance 'pos' and the tiling loop would not end
    if (!(c.window_offset > 0)) {
      std::cerr << "Window offset must be greater than zero." << std::endl;
      return -1;
    }

    BamTools::BamReader readerRef;
    if ( ! readerRef.Open(c.files[0].string()) ) return -1;
    BamTools::RefVector references = readerRef.GetReferenceData();
    typename BamTools::RefVector::const_iterator itRef = references.begin();
    for(;itRef!=references.end();++itRef) {
      int32_t pos = 0;
      while (pos < itRef->RefLength) {
        // clip the last window to the end of the reference
        int32_t window_len = pos+c.window_size;
        if (window_len > itRef->RefLength) window_len = itRef->RefLength;
        StructuralVariantRecord sv;
        sv.chr = itRef->RefName;
        sv.svStart = pos;
        sv.svEnd = window_len;
        // artificial windows are named "chr:start-end"
        std::stringstream s;
        s << sv.chr << ":" << sv.svStart << "-" << sv.svEnd;
        idToName.insert(std::make_pair(intervalCount, s.str()));
        sv.id = intervalCount++;
        svs.push_back(sv);
        pos += c.window_offset;
      }
    }
  }

  // Output data types
  typedef std::pair<std::string, int> TSampleSVPair;
  typedef std::pair<int, int> TBpRead;
  typedef std::map<TSampleSVPair, TBpRead> TCountMap;
  TCountMap countMap;

  // Annotate coverage
  annotateCoverage(c.files, c.minMapQual, c.inclCigar, sampleLib, svs, countMap, TSingleHit());

  // Output library statistics
  std::cout << "Library statistics" << std::endl;
  TSampleLibrary::const_iterator sampleIt=sampleLib.begin();
  for(;sampleIt!=sampleLib.end();++sampleIt) {
    std::cout << "Sample: " << sampleIt->first << std::endl;
    TLibraryMap::const_iterator libIt=sampleIt->second.begin();
    for(;libIt!=sampleIt->second.end();++libIt) {
      std::cout << "RG: ID=" << libIt->first << ",Median=" << libIt->second.median << ",MAD=" << libIt->second.mad << ",Orientation=" << (int) libIt->second.defaultOrient << ",MappedReads=" << libIt->second.mappedReads << ",DuplicatePairs=" << libIt->second.non_unique_pairs << ",UniquePairs=" << libIt->second.unique_pairs << std::endl;
    }
  }

  // Output file
  boost::iostreams::filtering_ostream dataOut;
  dataOut.push(boost::iostreams::gzip_compressor());
  dataOut.push(boost::iostreams::file_sink(c.outfile.string().c_str(), std::ios_base::out | std::ios_base::binary));

  // Iterate all SVs
  typename TSVs::const_iterator itSV = svs.begin();
  typename TSVs::const_iterator itSVEnd = svs.end();
  for(;itSV!=itSVEnd;++itSV) {
    // every id was inserted into idToName when its interval was created
    dataOut << itSV->chr << "\t" << itSV->svStart << "\t" << itSV->svEnd << "\t" << idToName.find(itSV->id)->second;
    // Iterate all samples
    for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
      // Get the sample name
      std::string sampleName(c.files[file_c].stem().string());
      TSampleSVPair sampleSVPair = std::make_pair(sampleName, itSV->id);
      // NOTE(review): the lookup result is dereferenced unchecked; this assumes
      // annotateCoverage() stored an entry for every (sample, interval) pair - confirm
      typename TCountMap::iterator countMapIt=countMap.find(sampleSVPair);
      dataOut << "\t";
      if (c.avg_flag) dataOut << ( (countMapIt->second.first) / (double) (itSV->svEnd - itSV->svStart)) << "\t";
      if (c.bp_flag) dataOut << countMapIt->second.first << "\t";
      dataOut << countMapIt->second.second;
    }
    dataOut << std::endl;
  }

  // End
  boost::posix_time::ptime now = boost::posix_time::second_clock::local_time();
  std::cout << '[' << boost::posix_time::to_simple_string(now) << "] Done." << std::endl;
  return 0;
}
// Computes per-interval coverage for every input BAM file and writes a
// gzip-compressed, tab-separated table (with a header line) to c.outfile.
//
// Steps:
//   1. Open every file in c.files, require an index, and collect its library
//      parameters via getLibraryParams().
//   2. Take the reference sequences from the first BAM file.
//   3. Build the interval list: read "chr start end name" records from
//      c.int_file when that is a non-empty regular file, otherwise tile every
//      reference with windows (c.window_size / c.window_offset, or
//      c.window_num equally sized windows per reference when that is > 0).
//   4. Call annotateCoverage() to fill the per-(sample, interval) counts.
//   5. Print library statistics to stdout and the table to c.outfile.
//
// Interval lines that are blank or name a chromosome absent from the BAM
// header are skipped silently; lines with a known chromosome but fewer than
// four fields are skipped with a warning on stderr.
//
// Returns 0 on success and -1 when there are no input files, a BAM file
// cannot be opened or lacks an index, or the window offset is zero.
inline int run(Config const& c, TCoverageType covType) {
  // Create library objects
  typedef boost::unordered_map<std::string, LibraryInfo> TLibraryMap;
  typedef boost::unordered_map<std::string, TLibraryMap> TSampleLibrary;
  TSampleLibrary sampleLib;

  // Scan libraries
  for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
    // Get a sample name (file name without extension)
    std::string sampleName(c.files[file_c].stem().string());

    // Check that all input bam files exist
    BamTools::BamReader reader;
    if ( ! reader.Open(c.files[file_c].string()) ) {
      std::cerr << "Could not open input bam file: " << c.files[file_c].string() << std::endl;
      reader.Close();
      return -1;
    }

    // Check that all input bam files are indexed
    reader.LocateIndex();
    if ( !reader.HasIndex() ) {
      std::cerr << "Missing bam index file: " << c.files[file_c].string() << std::endl;
      reader.Close();
      return -1;
    }

    // Get library parameters and overall maximum insert size
    TLibraryMap libInfo;
    getLibraryParams(c.files[file_c], libInfo, 0, 5);
    sampleLib.insert(std::make_pair(sampleName, libInfo));
  }

  // c.files[0] is read below, so there must be at least one input file
  if (c.files.size() == 0) {
    std::cerr << "No input bam files specified." << std::endl;
    return -1;
  }

  // Get references
  BamTools::BamReader readerRef;
  if ( ! readerRef.Open(c.files[0].string()) ) return -1;
  BamTools::RefVector references = readerRef.GetReferenceData();

  // Read all SV intervals
  typedef std::vector<CovRecord> TSVs;
  TSVs svs;
  std::map<unsigned int, std::string> idToName;
  unsigned int intervalCount=1;
  if (boost::filesystem::exists(c.int_file) && boost::filesystem::is_regular_file(c.int_file) && boost::filesystem::file_size(c.int_file)) {
    // chromosome name -> index into 'references'
    typedef boost::unordered_map<std::string, unsigned int> TMapChr;
    TMapChr mapChr;
    typename BamTools::RefVector::const_iterator itRef = references.begin();
    for(unsigned int i = 0;itRef!=references.end();++itRef, ++i) mapChr[ itRef->RefName ] = i;

    std::ifstream interval_file(c.int_file.string().c_str(), std::ifstream::in);
    if (interval_file.is_open()) {
      typedef boost::tokenizer< boost::char_separator<char> > Tokenizer;
      boost::char_separator<char> sep(" \t,;");
      std::string intervalLine;
      while (std::getline(interval_file, intervalLine)) {
        Tokenizer tokens(intervalLine, sep);
        Tokenizer::iterator tokIter = tokens.begin();
        Tokenizer::iterator tokEnd = tokens.end();

        // blank line
        if (tokIter == tokEnd) continue;

        // field 1: chromosome, must be present in the BAM header
        std::string chrName = *tokIter++;
        TMapChr::const_iterator mapChrIt = mapChr.find(chrName);
        if (mapChrIt == mapChr.end()) continue;

        // fields 2-4: start, end, name. The iterator must be checked before
        // every dereference; reading past the last token is undefined behaviour.
        bool complete = false;
        std::string startField;
        std::string endField;
        std::string svName;
        if (tokIter != tokEnd) {
          startField = *tokIter++;
          if (tokIter != tokEnd) {
            endField = *tokIter++;
            if (tokIter != tokEnd) {
              svName = *tokIter;
              complete = true;
            }
          }
        }
        if (!complete) {
          std::cerr << "Skipping interval line with fewer than 4 fields: " << intervalLine << std::endl;
          continue;
        }

        CovRecord sv;
        sv.chr = mapChrIt->second;
        sv.svStart = boost::lexical_cast<int32_t>(startField);
        sv.svEnd = boost::lexical_cast<int32_t>(endField) + 1;
        idToName.insert(std::make_pair(intervalCount, svName));
        sv.id = intervalCount++;
        svs.push_back(sv);
      }
      interval_file.close();
    }
  } else {
    // Create artificial intervals
    typename BamTools::RefVector::const_iterator itRef = references.begin();
    for(int refIndex=0;itRef!=references.end();++itRef, ++refIndex) {
      int32_t pos = 0;
      unsigned int wSize = c.window_size;
      unsigned int wOffset = c.window_offset;
      if (c.window_num>0) {
        // fixed number of non-overlapping windows per reference
        wSize=(itRef->RefLength / c.window_num) + 1;
        wOffset=wSize;
      }
      // a zero offset would never advance 'pos' and the loop below would not end
      if (wOffset == 0) {
        std::cerr << "Window offset must be greater than zero." << std::endl;
        return -1;
      }
      while (pos < itRef->RefLength) {
        // clip the last window to the end of the reference
        int32_t window_len = pos+wSize;
        if (window_len > itRef->RefLength) window_len = itRef->RefLength;
        CovRecord sv;
        sv.chr = refIndex;
        sv.svStart = pos;
        sv.svEnd = window_len;
        // artificial windows are named "chr:start-end"
        std::stringstream s;
        s << references[sv.chr].RefName << ":" << sv.svStart << "-" << sv.svEnd;
        idToName.insert(std::make_pair(intervalCount, s.str()));
        sv.id = intervalCount++;
        svs.push_back(sv);
        pos += wOffset;
      }
    }
  }

  // Output data types
  typedef std::pair<std::string, int> TSampleSVPair;
  typedef std::pair<int, int> TBpRead;
  typedef std::map<TSampleSVPair, TBpRead> TCountMap;
  TCountMap countMap;

  // Annotate coverage
  if (c.inclCigar) annotateCoverage(c.files, c.minGenoQual, sampleLib, svs, countMap, BpLevelType<BpLevelCount>(), covType);
  else annotateCoverage(c.files, c.minGenoQual, sampleLib, svs, countMap, BpLevelType<NoBpLevelCount>(), covType);

  // Output library statistics
  std::cout << "Library statistics" << std::endl;
  TSampleLibrary::const_iterator sampleIt=sampleLib.begin();
  for(;sampleIt!=sampleLib.end();++sampleIt) {
    std::cout << "Sample: " << sampleIt->first << std::endl;
    TLibraryMap::const_iterator libIt=sampleIt->second.begin();
    for(;libIt!=sampleIt->second.end();++libIt) {
      std::cout << "RG: ID=" << libIt->first << ",Median=" << libIt->second.median << ",MAD=" << libIt->second.mad << ",Orientation=" << (int) libIt->second.defaultOrient << std::endl;
    }
  }

  // Output file
  boost::iostreams::filtering_ostream dataOut;
  dataOut.push(boost::iostreams::gzip_compressor());
  dataOut.push(boost::iostreams::file_sink(c.outfile.string().c_str(), std::ios_base::out | std::ios_base::binary));

  // Print header
  dataOut << "#chr\tstart\tend\tid";
  for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
    std::string sampleName(c.files[file_c].stem().string());
    dataOut << "\t";
    if (c.avg_flag) dataOut << sampleName << "_avgcov" << "\t";
    if (c.bp_flag) dataOut << sampleName << "_bpcount" << "\t";
    if ((c.bp_flag) || (c.avg_flag)) dataOut << sampleName << "_readcount";
    else dataOut << sampleName;
  }
  dataOut << std::endl;

  // Iterate all SVs
  typename TSVs::const_iterator itSV = svs.begin();
  typename TSVs::const_iterator itSVEnd = svs.end();
  for(;itSV!=itSVEnd;++itSV) {
    // every id was inserted into idToName when its interval was created
    dataOut << references[itSV->chr].RefName << "\t" << itSV->svStart << "\t" << itSV->svEnd << "\t" << idToName.find(itSV->id)->second;
    // Iterate all samples
    for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
      // Get the sample name
      std::string sampleName(c.files[file_c].stem().string());
      TSampleSVPair sampleSVPair = std::make_pair(sampleName, itSV->id);
      // NOTE(review): the lookup result is dereferenced unchecked; this assumes
      // annotateCoverage() stored an entry for every (sample, interval) pair - confirm
      typename TCountMap::iterator countMapIt=countMap.find(sampleSVPair);
      dataOut << "\t";
      if (c.avg_flag) dataOut << ( (countMapIt->second.first) / (double) (itSV->svEnd - itSV->svStart)) << "\t";
      if (c.bp_flag) dataOut << countMapIt->second.first << "\t";
      dataOut << countMapIt->second.second;
    }
    dataOut << std::endl;
  }

  // End
  boost::posix_time::ptime now = boost::posix_time::second_clock::local_time();
  std::cout << '[' << boost::posix_time::to_simple_string(now) << "] Done." << std::endl;
  return 0;
}