// ValidateReaders checks that all the readers point to BAM files representing
// alignments against the same set of reference sequences, and that the
// sequences are identically ordered.  If these checks fail the operation of
// the multireader is undefined, so we force program exit.
void BamMultiReader::ValidateReaders(void) const {
    int firstRefCount = readers.front().first->GetReferenceCount();
    BamTools::RefVector firstRefData = readers.front().first->GetReferenceData();
    for (vector<pair<BamReader*, BamAlignment*> >::const_iterator it = readers.begin(); it != readers.end(); ++it) {
        BamReader* reader = it->first;
        BamTools::RefVector currentRefData = reader->GetReferenceData();
        BamTools::RefVector::const_iterator f = firstRefData.begin();
        BamTools::RefVector::const_iterator c = currentRefData.begin();
        if (reader->GetReferenceCount() != firstRefCount || firstRefData.size() != currentRefData.size()) {
            cerr << "ERROR: mismatched number of references in " << reader->GetFilename()
                      << " expected " << firstRefCount 
                      << " reference sequences but only found " << reader->GetReferenceCount() << endl;
            exit(1);
        }
        // this will be ok; we just checked above that we have identically-sized sets of references
        // here we simply check if they are all, in fact, equal in content
        while (f != firstRefData.end()) {
            if (f->RefName != c->RefName || f->RefLength != c->RefLength) {
                cerr << "ERROR: mismatched references found in " << reader->GetFilename()
                          << " expected: " << endl;
                for (BamTools::RefVector::const_iterator a = firstRefData.begin(); a != firstRefData.end(); ++a)
                    cerr << a->RefName << " " << a->RefLength << endl;
                cerr << "but found: " << endl;
                for (BamTools::RefVector::const_iterator a = currentRefData.begin(); a != currentRefData.end(); ++a)
                    cerr << a->RefName << " " << a->RefLength << endl;
                exit(1);
            }
            ++f; ++c;
        }
    }
}
Exemple #2
0
// Builds the chromosome lookup from a BAM reference list.  Each reference
// name is mapped to a (length, id) pair; ids are assigned in the order the
// references appear, starting at 0 (_maxId starts at -1 and is bumped before
// each use).
NewGenomeFile::NewGenomeFile(const BamTools::RefVector &refVector)
: _maxId(-1)
{
    for (BamTools::RefVector::const_iterator ref = refVector.begin(); ref != refVector.end(); ++ref) {
        ++_maxId;
        QuickString chromName = ref->RefName;
        const CHRPOS chromLength = ref->RefLength;
        _chromSizeIds[chromName] = pair<CHRPOS, CHRPOS>(chromLength, _maxId);
    }
}
Exemple #3
0
// Builds the chromosome lookup and the ordered chromosome list from a BAM
// reference list.  Each reference gets the next id (starting at 0) and is
// stored as name -> (length, id); its name is also appended to _chromList.
// A final entry with an empty name and length 0 is appended afterwards.
NewGenomeFile::NewGenomeFile(const BamTools::RefVector &refVector)
: _maxId(-1)
{
    for (BamTools::RefVector::const_iterator ref = refVector.begin(); ref != refVector.end(); ++ref) {
        ++_maxId;
        const string &chromName = ref->RefName;
        const CHRPOS chromLength = ref->RefLength;
        _chromSizeIds[chromName] = pair<CHRPOS, CHRPOS>(chromLength, _maxId);
        _chromList.push_back(chromName);
    }

    // Special: BAM files can have unmapped reads, which show as no chromosome, or an empty chrom string.
    // Register an empty chrom (length 0, next free id) so these don't error.
    ++_maxId;
    _chromList.push_back("");
    _chromSizeIds[""] = pair<CHRPOS, int>(0, _maxId);
}
Exemple #4
0
// Sets up one TargetContig per entry of `references` (same order) and records
// each contig's index under its reference name in m_contigIdx.  With an empty
// reference list no contig array is allocated and m_contigList stays NULL.
RegionCoverage::RegionCoverage( const BamTools::RefVector& references )
	: m_contigList(NULL)
	, m_bcovRegion(NULL)
	, m_rcovRegion(NULL)
    , m_lastRegionAssigned(NULL)
	, m_numAuxFields(0)
	, m_ncovDepths(0)
{
	m_numRefContigs = references.size();

	// both contig cursors start one past the last contig, which
	// forces set up on first call to 'iterator'
	m_rcovContigIdx = m_numRefContigs;
	m_bcovContigIdx = m_rcovContigIdx;
	m_bcovRegionPos = 0;

	if( !m_numRefContigs ) {
		return;
	}

	m_contigList = new TargetContig*[m_numRefContigs];
	for( size_t idx = 0; idx < m_numRefContigs; ++idx ) {
		const BamTools::RefData& ref = references[idx];
		m_contigList[idx] = new TargetContig( ref.RefName, ref.RefLength );
		m_contigIdx[ ref.RefName ] = idx;
	}
}
Exemple #5
0
// Compares two BAM files reference by reference and writes the result to a
// gzip output stream.
//
// Parameters:
//   bamFile1, bamFile2  paths handed to bamCheck() together with the readers
//   outFile             path of the output file opened through ogzstream
//   sample              forwarded to readerToMeth()
//   d                   forwarded to readerToMeth()
//   freq, methdiff      forwarded to interLoci()
//
// Returns 0 on success.  Returns 1 when an index could not be located for
// both BAM files; previously this path did `return false`, i.e. 0, which
// callers could not distinguish from success.
int bamToLociMeth(std::string bamFile1, 
                  std::string bamFile2, 
                  const char * outFile, 
                  std::string sample, 
                  int d, int freq, int methdiff)
{
    std::map <std::string, int> allpatterns;
    std::map <std::string, int> patternMeth;
    getAllPatterns(allpatterns, patternMeth);
    
    BamTools::BamReader reader1;
    BamTools::BamReader reader2;
    bamCheck(bamFile1, reader1);
    bamCheck(bamFile2, reader2);
    
    // count reads number of bam files for normalization. 
    /*int n1=0;
    int n2=0;
    BamTools::BamAlignment al1;
    BamTools::BamAlignment al2;
    while(reader1.GetNextAlignment(al1)){
        n1=n1+1;
    }
    while(reader2.GetNextAlignment(al2)){
        n2=n2+1;
    }
    std::cout << bamFile1 << " read count: " << n1 << ".\n" << bamFile2 << " read count: " << n2 << "." << std::endl;
    double fnorm1k2=double(n1)/double(n2);
    */

    // get reference name from first bam
    const BamTools::RefVector refs = reader1.GetReferenceData();
    reader1.LocateIndex();
    reader2.LocateIndex();

    // both files must be indexed (logical AND, not bitwise)
    if (!(reader1.HasIndex() && reader2.HasIndex()))
    {
        std::cerr << "Could not load index data for all input BAM file(s)... Aborting." << std::endl;
        reader1.Close();
        reader2.Close();
        return 1;
    }

    std::cerr << "Output file: " << outFile << std::endl;
    std::cerr << "===============================" << std::endl;

    ogzstream myfile;
    myfile.open (outFile);
    header(myfile, allpatterns);

    for(BamTools::RefVector::const_iterator i = refs.begin(); i != refs.end(); ++i)
    {
        // per-reference results; rebuilt for every reference sequence
        std::map<std::string, std::vector<std::string> > lociMeth1;
        std::map<std::string, std::vector<std::string> > lociMeth2;
        // iterate by chromosome for bamFile1 and bamFile2
        readerToMeth(reader1, reader2, lociMeth1, lociMeth2, i, d, refs, sample);
        //entropy analysis
        interLoci(lociMeth1, lociMeth2, allpatterns, patternMeth, freq, methdiff, myfile);
    }

    reader1.Close();
    reader2.Close();
    myfile.close();
    finished();
    return 0;
}
Exemple #6
0
// Coverage annotation driver (single-hit variant).
//
// Steps:
//   1. Open every input BAM in c.files, require an index, and collect library
//      parameters per sample (sample name = file stem).
//   2. Build the interval list: read it from c.int_file when that is a
//      non-empty regular file, otherwise tile every reference of the first
//      BAM with windows of c.window_size advanced by c.window_offset.
//   3. Call annotateCoverage() and print library statistics to stdout.
//   4. Write one gzip-compressed, tab-separated line per interval to c.outfile.
//
// Returns 0 on success and -1 when an input BAM cannot be opened, lacks an
// index, no input file is available for window generation, or the window
// offset is not positive.
inline int
run(Config const& c, TSingleHit)
{
  // Create library objects
  typedef std::map<std::string, LibraryInfo> TLibraryMap;
  typedef std::map<std::string, TLibraryMap> TSampleLibrary;
  TSampleLibrary sampleLib;

  // Scan libraries
  for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
    // Get a sample name
    std::string sampleName(c.files[file_c].stem().string());

    // Check that all input bam files exist
    BamTools::BamReader reader;
    if ( ! reader.Open(c.files[file_c].string()) ) {
      std::cerr << "Could not open input bam file: " << c.files[file_c].string() << std::endl;
      reader.Close();
      return -1;
    }
    
    // Check that all input bam files are indexed
    reader.LocateIndex();
    if ( !reader.HasIndex() ) {
      std::cerr << "Missing bam index file: " << c.files[file_c].string() << std::endl;
      reader.Close();
      return -1;
    }

    // Get library parameters and overall maximum insert size
    TLibraryMap libInfo;
    getLibraryParams(c.files[file_c], libInfo, 0, 5);
    sampleLib.insert(std::make_pair(sampleName, libInfo));
  }

  // Read all SV intervals
  typedef std::vector<StructuralVariantRecord> TSVs;
  TSVs svs;
  std::map<unsigned int, std::string> idToName;
  unsigned int intervalCount=1;
  if (boost::filesystem::exists(c.int_file) && boost::filesystem::is_regular_file(c.int_file) && boost::filesystem::file_size(c.int_file)) {
    Memory_mapped_file interval_file(c.int_file.string().c_str());
    char interval_buffer[Memory_mapped_file::MAX_LINE_LENGTH];
    while (interval_file.left_bytes() > 0) {
      interval_file.read_line(interval_buffer);
      // Read single interval line: chr, start, end, name
      StructuralVariantRecord sv;
      Tokenizer token(interval_buffer, Memory_mapped_file::MAX_LINE_LENGTH);
      token.getString(sv.chr);
      sv.svStart = token.getUInt();
      sv.svEnd = token.getUInt() + 1;
      std::string svName;
      token.getString(svName);
      idToName.insert(std::make_pair(intervalCount, svName));
      sv.id = intervalCount++;
      svs.push_back(sv);
    }
    interval_file.close();
  } else {
    // Create artificial intervals
    // c.files[0] below requires at least one input file
    if (c.files.empty()) {
      std::cerr << "No input bam file available to derive reference windows." << std::endl;
      return -1;
    }
    // a non-positive offset would never advance 'pos' past the reference end
    if (!(c.window_offset > 0)) {
      std::cerr << "Window offset must be greater than zero." << std::endl;
      return -1;
    }
    BamTools::BamReader readerRef;
    if ( ! readerRef.Open(c.files[0].string()) ) return -1;
    BamTools::RefVector references = readerRef.GetReferenceData();
    typename BamTools::RefVector::const_iterator itRef = references.begin();
    for(;itRef!=references.end();++itRef) {
      int32_t pos = 0;
      while (pos < itRef->RefLength) {
        // clip the last window to the end of the reference
        int32_t window_len = pos+c.window_size;
        if (window_len > itRef->RefLength) window_len = itRef->RefLength;
        StructuralVariantRecord sv;
        sv.chr = itRef->RefName;
        sv.svStart = pos;
        sv.svEnd = window_len;
        std::stringstream s; 
        s << sv.chr << ":" << sv.svStart << "-" << sv.svEnd;
        idToName.insert(std::make_pair(intervalCount, s.str()));
        sv.id = intervalCount++;
        svs.push_back(sv);
        pos += c.window_offset;
      }
    }
  }

  // Output data types
  typedef std::pair<std::string, int> TSampleSVPair;
  typedef std::pair<int, int> TBpRead;
  typedef std::map<TSampleSVPair, TBpRead> TCountMap;
  TCountMap countMap;

  // Annotate coverage
  annotateCoverage(c.files, c.minMapQual, c.inclCigar, sampleLib, svs, countMap, TSingleHit());

  // Output library statistics
  std::cout << "Library statistics" << std::endl;
  TSampleLibrary::const_iterator sampleIt=sampleLib.begin();
  for(;sampleIt!=sampleLib.end();++sampleIt) {
    std::cout << "Sample: " << sampleIt->first << std::endl;
    TLibraryMap::const_iterator libIt=sampleIt->second.begin();
    for(;libIt!=sampleIt->second.end();++libIt) {
      std::cout << "RG: ID=" << libIt->first << ",Median=" << libIt->second.median << ",MAD=" << libIt->second.mad << ",Orientation=" << (int) libIt->second.defaultOrient << ",MappedReads=" << libIt->second.mappedReads << ",DuplicatePairs=" << libIt->second.non_unique_pairs << ",UniquePairs=" << libIt->second.unique_pairs << std::endl;
    }
  }

  // Output file
  boost::iostreams::filtering_ostream dataOut;
  dataOut.push(boost::iostreams::gzip_compressor());
  dataOut.push(boost::iostreams::file_sink(c.outfile.string().c_str(), std::ios_base::out | std::ios_base::binary));

  // Iterate all SVs
  typename TSVs::const_iterator itSV = svs.begin();
  typename TSVs::const_iterator itSVEnd = svs.end();
  for(;itSV!=itSVEnd;++itSV) {
    dataOut << itSV->chr << "\t" << itSV->svStart << "\t" << itSV->svEnd << "\t" << idToName.find(itSV->id)->second;
    // Iterate all samples
    for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
      // Get the sample name
      std::string sampleName(c.files[file_c].stem().string());
      TSampleSVPair sampleSVPair = std::make_pair(sampleName, itSV->id);
      // Never dereference end(): a (sample, interval) pair that was not
      // filled in by annotateCoverage is reported with zero counts.
      typename TCountMap::const_iterator countMapIt=countMap.find(sampleSVPair);
      TBpRead counts(0, 0);
      if (countMapIt != countMap.end()) counts = countMapIt->second;
      dataOut << "\t";
      if (c.avg_flag) dataOut << ( (counts.first) / static_cast<double>(itSV->svEnd - itSV->svStart)) << "\t";
      if (c.bp_flag) dataOut << counts.first << "\t";
      dataOut << counts.second;
    }
    dataOut << std::endl;
  }

  // End
  boost::posix_time::ptime now = boost::posix_time::second_clock::local_time();
  std::cout << '[' << boost::posix_time::to_simple_string(now) << "] Done." << std::endl;
  return 0;
}
Exemple #7
0
// Coverage annotation driver (coverage-type variant).
//
// Steps:
//   1. Open every input BAM in c.files, require an index, and collect library
//      parameters per sample (sample name = file stem).
//   2. Take the reference list from the first BAM.
//   3. Build the interval list: read it from c.int_file when that is a
//      non-empty regular file (columns separated by space, tab, ',' or ';':
//      chr, start, end and an optional name), otherwise tile every reference
//      with windows.  Lines whose chromosome is not in the BAM header, or
//      that lack a start or end column, are skipped.
//   4. Call annotateCoverage() and print library statistics to stdout.
//   5. Write a header plus one gzip-compressed, tab-separated line per
//      interval to c.outfile.
//
// Returns 0 on success and -1 when no input file is given, an input BAM
// cannot be opened or lacks an index, or the effective window offset is zero.
// boost::lexical_cast may still throw on non-numeric start/end columns.
inline int
run(Config const& c, TCoverageType covType)
{
  // Create library objects
  typedef boost::unordered_map<std::string, LibraryInfo> TLibraryMap;
  typedef boost::unordered_map<std::string, TLibraryMap> TSampleLibrary;
  TSampleLibrary sampleLib;

  // Scan libraries
  for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
    // Get a sample name
    std::string sampleName(c.files[file_c].stem().string());

    // Check that all input bam files exist
    BamTools::BamReader reader;
    if ( ! reader.Open(c.files[file_c].string()) ) {
      std::cerr << "Could not open input bam file: " << c.files[file_c].string() << std::endl;
      reader.Close();
      return -1;
    }
    
    // Check that all input bam files are indexed
    reader.LocateIndex();
    if ( !reader.HasIndex() ) {
      std::cerr << "Missing bam index file: " << c.files[file_c].string() << std::endl;
      reader.Close();
      return -1;
    }

    // Get library parameters and overall maximum insert size
    TLibraryMap libInfo;
    getLibraryParams(c.files[file_c], libInfo, 0, 5);
    sampleLib.insert(std::make_pair(sampleName, libInfo));
  }

  // Get references
  // c.files[0] below requires at least one input file
  if (c.files.empty()) {
    std::cerr << "No input bam file available to read the references from." << std::endl;
    return -1;
  }
  BamTools::BamReader readerRef;
  if ( ! readerRef.Open(c.files[0].string()) ) return -1;
  BamTools::RefVector references = readerRef.GetReferenceData();

  // Read all SV intervals
  typedef std::vector<CovRecord> TSVs;
  TSVs svs;
  std::map<unsigned int, std::string> idToName;
  unsigned int intervalCount=1;
  if (boost::filesystem::exists(c.int_file) && boost::filesystem::is_regular_file(c.int_file) && boost::filesystem::file_size(c.int_file)) {
    // Map reference name -> index into 'references'
    typedef boost::unordered_map<std::string, unsigned int> TMapChr;
    TMapChr mapChr;
    typename BamTools::RefVector::const_iterator itRef = references.begin();
    for(unsigned int i = 0;itRef!=references.end();++itRef, ++i) mapChr[ itRef->RefName ] = i;
    std::ifstream interval_file(c.int_file.string().c_str(), std::ifstream::in);
    if (interval_file.is_open()) {
      typedef boost::tokenizer< boost::char_separator<char> > Tokenizer;
      boost::char_separator<char> sep(" \t,;");
      std::string intervalLine;
      while (std::getline(interval_file, intervalLine)) {
        Tokenizer tokens(intervalLine, sep);
        Tokenizer::iterator tokIter = tokens.begin();
        if (tokIter == tokens.end()) continue;          // blank line
        const std::string chrName = *tokIter++;
        TMapChr::const_iterator mapChrIt = mapChr.find(chrName);
        if (mapChrIt == mapChr.end()) continue;         // chromosome not in the BAM header
        // Every token is checked before it is dereferenced; the original
        // read start, end and name unconditionally and ran past end() on
        // short lines.
        if (tokIter == tokens.end()) continue;          // no start column
        const std::string startTok = *tokIter++;
        if (tokIter == tokens.end()) continue;          // no end column
        const std::string endTok = *tokIter++;
        CovRecord sv;
        sv.chr = mapChrIt->second;
        sv.svStart = boost::lexical_cast<int32_t>(startTok);
        sv.svEnd = boost::lexical_cast<int32_t>(endTok) + 1;
        std::string svName;
        if (tokIter != tokens.end()) {
          svName = *tokIter;
        } else {
          // name column is optional: fall back to the same chr:start-end
          // scheme used for the artificial windows
          std::stringstream s;
          s << chrName << ":" << sv.svStart << "-" << sv.svEnd;
          svName = s.str();
        }
        idToName.insert(std::make_pair(intervalCount, svName));
        sv.id = intervalCount++;
        svs.push_back(sv);
      }
      interval_file.close();
    }
  } else {
    // Create artificial intervals
    typename BamTools::RefVector::const_iterator itRef = references.begin();
    for(int refIndex=0;itRef!=references.end();++itRef, ++refIndex) {
      int32_t pos = 0;
      unsigned int wSize = c.window_size;
      unsigned int wOffset = c.window_offset;
      if (c.window_num>0) {
        // fixed number of non-overlapping windows per reference
        wSize=(itRef->RefLength / c.window_num) + 1;
        wOffset=wSize;
      }
      // a zero offset would never advance 'pos' past the reference end
      if (wOffset == 0) {
        std::cerr << "Window offset must be greater than zero." << std::endl;
        return -1;
      }
      while (pos < itRef->RefLength) {
        // clip the last window to the end of the reference
        int32_t window_len = pos+wSize;
        if (window_len > itRef->RefLength) window_len = itRef->RefLength;
        CovRecord sv;
        sv.chr = refIndex;
        sv.svStart = pos;
        sv.svEnd = window_len;
        std::stringstream s; 
        s << itRef->RefName << ":" << sv.svStart << "-" << sv.svEnd;
        idToName.insert(std::make_pair(intervalCount, s.str()));
        sv.id = intervalCount++;
        svs.push_back(sv);
        pos += wOffset;
      }
    }
  }

  // Output data types
  typedef std::pair<std::string, int> TSampleSVPair;
  typedef std::pair<int, int> TBpRead;
  typedef std::map<TSampleSVPair, TBpRead> TCountMap;
  TCountMap countMap;

  // Annotate coverage
  if (c.inclCigar) annotateCoverage(c.files, c.minGenoQual, sampleLib, svs, countMap, BpLevelType<BpLevelCount>(), covType);
  else annotateCoverage(c.files, c.minGenoQual, sampleLib, svs, countMap, BpLevelType<NoBpLevelCount>(), covType);

  // Output library statistics
  std::cout << "Library statistics" << std::endl;
  TSampleLibrary::const_iterator sampleIt=sampleLib.begin();
  for(;sampleIt!=sampleLib.end();++sampleIt) {
    std::cout << "Sample: " << sampleIt->first << std::endl;
    TLibraryMap::const_iterator libIt=sampleIt->second.begin();
    for(;libIt!=sampleIt->second.end();++libIt) {
      std::cout << "RG: ID=" << libIt->first << ",Median=" << libIt->second.median << ",MAD=" << libIt->second.mad << ",Orientation=" << (int) libIt->second.defaultOrient << std::endl;
    }
  }

  // Output file
  boost::iostreams::filtering_ostream dataOut;
  dataOut.push(boost::iostreams::gzip_compressor());
  dataOut.push(boost::iostreams::file_sink(c.outfile.string().c_str(), std::ios_base::out | std::ios_base::binary));

  // Print header
  dataOut << "#chr\tstart\tend\tid";
  for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
    std::string sampleName(c.files[file_c].stem().string());
    dataOut << "\t";
    if (c.avg_flag) dataOut << sampleName << "_avgcov" << "\t";
    if (c.bp_flag) dataOut << sampleName << "_bpcount" << "\t";
    if ((c.bp_flag) || (c.avg_flag)) dataOut << sampleName << "_readcount";
    else dataOut << sampleName;
  }
  dataOut << std::endl;

  // Iterate all SVs
  typename TSVs::const_iterator itSV = svs.begin();
  typename TSVs::const_iterator itSVEnd = svs.end();
  for(;itSV!=itSVEnd;++itSV) {
    dataOut << references[itSV->chr].RefName << "\t" << itSV->svStart << "\t" << itSV->svEnd << "\t" << idToName.find(itSV->id)->second;
    // Iterate all samples
    for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
      // Get the sample name
      std::string sampleName(c.files[file_c].stem().string());
      TSampleSVPair sampleSVPair = std::make_pair(sampleName, itSV->id);
      // Never dereference end(): a (sample, interval) pair that was not
      // filled in by annotateCoverage is reported with zero counts.
      typename TCountMap::const_iterator countMapIt=countMap.find(sampleSVPair);
      TBpRead counts(0, 0);
      if (countMapIt != countMap.end()) counts = countMapIt->second;
      dataOut << "\t";
      if (c.avg_flag) dataOut << ( (counts.first) / static_cast<double>(itSV->svEnd - itSV->svStart)) << "\t";
      if (c.bp_flag) dataOut << counts.first << "\t";
      dataOut << counts.second;
    }
    dataOut << std::endl;
  }

  // End
  boost::posix_time::ptime now = boost::posix_time::second_clock::local_time();
  std::cout << '[' << boost::posix_time::to_simple_string(now) << "] Done." << std::endl;
  return 0;
}