Ejemplo n.º 1
/**
 * Streams every alignment of the BAM file `filename` and forwards the ones
 * that pass both isGoodRead() and isChrToParse() to insertRead(), updating
 * the running read count / mean read length along the way.
 *
 * @param reads          container handed to insertRead() for each kept read
 * @param filename       path given to BamTools::BamReader::Open()
 * @param chrs_to_parse  chromosome list consulted by isChrToParse()
 * @throws FileNotGood   when the BAM file cannot be opened
 *
 * NOTE(review): this excerpt has lost its closing braces (including the one
 * that ends the Open() check) — restore them from the original file.
 */
void bamParser::parse(Reads & reads, string & filename,
		vector<string> & chrs_to_parse) {
	BamTools::BamReader bam;
	BamTools::BamAlignment read;

	string chr;
	// Running statistics, updated through updateAvgReadLength().
	uint64_t readCnt = 0;
	uint32_t meanReadLen = 0;

	if (!(bam.Open(filename))) {
		throw FileNotGood(filename);

	// Reference table of the opened file, passed to getR1Chr() for each read.
	const BamTools::RefVector refvec = bam.GetReferenceData();

	while (bam.GetNextAlignment(read)) {
		// The chromosome is resolved for every read, before any filtering.
		chr = getR1Chr(read, refvec);
		if (isGoodRead(read)) {
			if (isChrToParse(chrs_to_parse, chr)) {
				updateAvgReadLength(readCnt, meanReadLen, read);
				insertRead(read, reads, chr);
	/**
	 * Returns the RefLength of the reference sequence named by
	 * regionPtr->getReferenceID(), as recorded in the BAM file at bamPath.
	 *
	 * @param bamPath    path of the BAM file to open
	 * @param regionPtr  region whose reference name is looked up
	 * @return           RefLength of the matching reference entry
	 * @throws const char*  a string literal when the file cannot be opened
	 *
	 * NOTE(review): the function's braces are missing from this excerpt.
	 */
	position BamAlignmentReader::GetLastPositionInBam(const std::string& bamPath, Region::SharedPtr regionPtr)
		BamTools::BamReader bamReader;
		if (!bamReader.Open(bamPath))
			// NOTE(review): throws a bare string literal; callers must catch const char*.
			throw "Unable to open bam file";

		// NOTE(review): refID is used as an index below without validation —
		// confirm what GetReferenceID() returns for an unknown reference name.
		int refID = bamReader.GetReferenceID(regionPtr->getReferenceID());
		auto referenceData = bamReader.GetReferenceData();
		return referenceData[refID].RefLength;
Ejemplo n.º 3
Archivo: cov.cpp Proyecto: AmaliT/delly
/**
 * Annotates coverage for a set of intervals across all input BAM files and
 * writes one tab-separated row per interval to c.outfile.
 *
 * Steps visible in this block:
 *   1. Open every file in c.files, require an index, and collect library
 *      parameters per sample (sample name = file name stem).
 *   2. Build the interval list: from c.int_file when it exists, is a regular
 *      file and is non-empty; otherwise from windows of c.window_size advanced
 *      by c.window_offset over every reference of the first BAM file.
 *   3. Call annotateCoverage() and print library statistics to std::cout.
 *   4. Write the per-interval, per-sample values to c.outfile.
 *
 * @param c  run configuration (files, int_file, window_size, window_offset,
 *           minMapQual, inclCigar, avg_flag, bp_flag, outfile are read here)
 * @return   0 on success, -1 when a BAM file cannot be opened or lacks an index
 *
 * NOTE(review): this excerpt has lost its closing braces and apparently some
 * statements (e.g. nothing visible here fills interval_buffer or appends to
 * svs) — restore them from the original file before relying on this text.
 */
inline int
run(Config const& c, TSingleHit)
  // Create library objects
  typedef std::map<std::string, LibraryInfo> TLibraryMap;
  typedef std::map<std::string, TLibraryMap> TSampleLibrary;
  TSampleLibrary sampleLib;

  // Scan libraries
  for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
    // Get a sample name
    std::string sampleName(c.files[file_c].stem().string());

    // Check that all input bam files exist
    BamTools::BamReader reader;
    if ( ! reader.Open(c.files[file_c].string()) ) {
      std::cerr << "Could not open input bam file: " << c.files[file_c].string() << std::endl;
      return -1;
    // Check that all input bam files are indexed
    if ( !reader.HasIndex() ) {
      std::cerr << "Missing bam index file: " << c.files[file_c].string() << std::endl;
      return -1;

    // Get library parameters and overall maximum insert size
    TLibraryMap libInfo;
    getLibraryParams(c.files[file_c], libInfo, 0, 5);
    sampleLib.insert(std::make_pair(sampleName, libInfo));

  // Read all SV intervals
  typedef std::vector<StructuralVariantRecord> TSVs;
  TSVs svs;
  // Maps the numeric interval id (starting at 1) to its printable name.
  std::map<unsigned int, std::string> idToName;
  unsigned int intervalCount=1;
  if (boost::filesystem::exists(c.int_file) && boost::filesystem::is_regular_file(c.int_file) && boost::filesystem::file_size(c.int_file)) {
    Memory_mapped_file interval_file(c.int_file.string().c_str());
    char interval_buffer[Memory_mapped_file::MAX_LINE_LENGTH];
    while (interval_file.left_bytes() > 0) {
      // Read single interval line
      StructuralVariantRecord sv;
      Tokenizer token(interval_buffer, Memory_mapped_file::MAX_LINE_LENGTH);
      std::string interval_rname;
      sv.svStart = token.getUInt();
      // The end coordinate read from the file is stored incremented by one.
      sv.svEnd = token.getUInt() + 1;
      std::string svName;
      idToName.insert(std::make_pair(intervalCount, svName));
      sv.id = intervalCount++;
  } else {
    // Create artificial intervals
    BamTools::BamReader readerRef;
    if ( ! readerRef.Open(c.files[0].string()) ) return -1;
    BamTools::RefVector references = readerRef.GetReferenceData();
    typename BamTools::RefVector::const_iterator itRef = references.begin();
    for(int refIndex=0;itRef!=references.end();++itRef, ++refIndex) {
      int32_t pos = 0;
      while (pos < references[refIndex].RefLength) {
	// window_len holds the window's END coordinate, clamped to the reference length.
	int32_t window_len = pos+c.window_size;
	if (window_len > references[refIndex].RefLength) window_len = references[refIndex].RefLength;
	StructuralVariantRecord sv;
	sv.chr = references[refIndex].RefName;
	sv.svStart = pos;
	sv.svEnd = window_len;
	// Artificial windows are named "<chr>:<start>-<end>".
	std::stringstream s; 
	s << sv.chr << ":" << sv.svStart << "-" << sv.svEnd;
	idToName.insert(std::make_pair(intervalCount, s.str()));
	sv.id = intervalCount++;
	pos += c.window_offset;

  // Output data types
  // Key: (sample name, interval id). Value: a pair of counters; presumably
  // (base-pair count, read count) judging by the TBpRead name — TODO confirm.
  typedef std::pair<std::string, int> TSampleSVPair;
  typedef std::pair<int, int> TBpRead;
  typedef std::map<TSampleSVPair, TBpRead> TCountMap;
  TCountMap countMap;

  // Annotate coverage
  annotateCoverage(c.files, c.minMapQual, c.inclCigar, sampleLib, svs, countMap, TSingleHit());

  // Output library statistics
  std::cout << "Library statistics" << std::endl;
  TSampleLibrary::const_iterator sampleIt=sampleLib.begin();
  for(;sampleIt!=sampleLib.end();++sampleIt) {
    std::cout << "Sample: " << sampleIt->first << std::endl;
    TLibraryMap::const_iterator libIt=sampleIt->second.begin();
    for(;libIt!=sampleIt->second.end();++libIt) {
      std::cout << "RG: ID=" << libIt->first << ",Median=" << libIt->second.median << ",MAD=" << libIt->second.mad << ",Orientation=" << (int) libIt->second.defaultOrient << ",MappedReads=" << libIt->second.mappedReads << ",DuplicatePairs=" << libIt->second.non_unique_pairs << ",UniquePairs=" << libIt->second.unique_pairs << std::endl;

  // Output file
  boost::iostreams::filtering_ostream dataOut;
  dataOut.push(boost::iostreams::file_sink(c.outfile.string().c_str(), std::ios_base::out | std::ios_base::binary));

  // Iterate all SVs
  typename TSVs::const_iterator itSV = svs.begin();
  typename TSVs::const_iterator itSVEnd = svs.end();
  for(;itSV!=itSVEnd;++itSV) {
    // NOTE(review): the find() result is dereferenced without an end() check.
    dataOut << itSV->chr << "\t" << itSV->svStart << "\t" << itSV->svEnd << "\t" << idToName.find(itSV->id)->second;
    // Iterate all samples
    for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
      // Get the sample name
      std::string sampleName(c.files[file_c].stem().string());
      TSampleSVPair sampleSVPair = std::make_pair(sampleName, itSV->id);
      // NOTE(review): countMapIt is dereferenced below without an end() check;
      // this relies on annotateCoverage() having created an entry for every
      // (sample, interval) pair — confirm.
      typename TCountMap::iterator countMapIt=countMap.find(sampleSVPair);
      dataOut << "\t";
      // Average = first counter divided by the interval length (svEnd - svStart).
      // NOTE(review): the divisor is zero if an interval has svEnd == svStart.
      if (c.avg_flag) dataOut << ( (countMapIt->second.first) / (double) (itSV->svEnd - itSV->svStart)) << "\t";
      if (c.bp_flag) dataOut << countMapIt->second.first << "\t";
      dataOut << countMapIt->second.second;
    dataOut << std::endl;

  // End
  boost::posix_time::ptime now = boost::posix_time::second_clock::local_time();
  std::cout << '[' << boost::posix_time::to_simple_string(now) << "] Done." << std::endl;;
  return 0;
Ejemplo n.º 4
// Main
int filterBAMMain(int argc, char** argv)
    parseFilterBAMOptions(argc, argv);

    // Read the graph if distance-filtering mode is enabled
    StringGraph* pGraph = NULL;
        pGraph = SGUtil::loadASQG(opt::asqgFile, 0, false);

    // Read the BWTs if depth-filtering mode is enabled
    BWT* pBWT = NULL;
    BWT* pRBWT = NULL;
        pBWT = new BWT(opt::fmIndexPrefix + BWT_EXT, opt::sampleRate);
        pRBWT = new BWT(opt::fmIndexPrefix + RBWT_EXT, opt::sampleRate);

    Timer* pTimer = new Timer(PROGRAM_IDENT);    

    int numPairsTotal = 0;
    int numPairsFilteredByDistance = 0;
    int numPairsFilteredByER = 0;
    int numPairsFilteredByQuality = 0;
    int numPairsFilteredByDepth = 0;
    int numPairsUnmapped = 0;
    int numPairsWrote = 0;

    // Open the bam files for reading/writing
    BamTools::BamReader* pBamReader = new BamTools::BamReader;

    BamTools::BamWriter* pBamWriter = new BamTools::BamWriter;
    pBamWriter->Open(opt::outFile, pBamReader->GetHeaderText(), pBamReader->GetReferenceData());
    const BamTools::RefVector& referenceVector = pBamReader->GetReferenceData();

    BamTools::BamAlignment record1;
    BamTools::BamAlignment record2;
    bool done = false;

        if(numPairsTotal++ % 200000 == 0)
            printf("[sga filterBAM] Processed %d pairs\n", numPairsTotal);

        done = !readAlignmentPair(pBamReader, record1, record2);

        if(!record1.IsMapped() || !record2.IsMapped())
            numPairsUnmapped += 1;

        // Ensure the pairing is correct
        if(record1.Name != record2.Name)
            std::cout << "NAME FAIL: " << record1.Name << " " << record2.Name << "\n";
        assert(record1.Name == record2.Name);
        bool bPassedFilters = true;

        // Check if the error rate is below the max
        double er1 = getErrorRate(record1);
        double er2 = getErrorRate(record2);

        if(er1 > opt::maxError || er2 > opt::maxError)
            bPassedFilters = false;
            numPairsFilteredByER += 1;

        if(record1.MapQuality < opt::minQuality || record2.MapQuality < opt::minQuality)
            bPassedFilters = false;
            numPairsFilteredByQuality += 1;

        // Perform depth check for pairs aligning to different contigs
        if(bPassedFilters && (pBWT != NULL && pRBWT != NULL && opt::maxKmerDepth > 0) && (record1.RefID != record2.RefID))
            int maxDepth1 = getMaxKmerDepth(record1.QueryBases, pBWT, pRBWT);
            int maxDepth2 = getMaxKmerDepth(record1.QueryBases, pBWT, pRBWT);
            if(maxDepth1 > opt::maxKmerDepth || maxDepth2 > opt::maxKmerDepth)
                bPassedFilters = false;
                numPairsFilteredByDepth += 1;

        // Perform short-insert pair check
        if(pGraph != NULL)
            bPassedFilters = bPassedFilters && filterByGraph(pGraph, referenceVector, record1, record2);
            numPairsFilteredByDistance += 1;
            numPairsWrote += 1;

    std::cout << "Total pairs: " << numPairsTotal << "\n";
    std::cout << "Total pairs output: " << numPairsWrote << "\n";
    std::cout << "Total filtered because one pair is unmapped: " << numPairsUnmapped << "\n";
    std::cout << "Total filtered by distance: " << numPairsFilteredByDistance << "\n";
    std::cout << "Total filtered by error rate: " << numPairsFilteredByER << "\n";
    std::cout << "Total filtered by quality: " << numPairsFilteredByQuality << "\n";
    std::cout << "Total filtered by depth: " << numPairsFilteredByDepth << "\n";
    if(pGraph != NULL)
        delete pGraph;

    if(pBWT != NULL)
        delete pBWT;

    if(pRBWT != NULL)
        delete pRBWT;


    delete pTimer;
    delete pBamReader;
    delete pBamWriter;
    return 0;
Ejemplo n.º 5
/**
 * Annotates coverage for a set of intervals across all input BAM files and
 * writes a header line plus one tab-separated row per interval to c.outfile.
 *
 * Steps visible in this block:
 *   1. Open every file in c.files, require an index, and collect library
 *      parameters per sample (sample name = file name stem).
 *   2. Load the reference table from the first BAM file.
 *   3. Build the interval list: from c.int_file when it exists, is a regular
 *      file and is non-empty (lines split on space, tab, ',' and ';');
 *      otherwise from windows laid over every reference.
 *   4. Call annotateCoverage() (with or without base-pair level counting,
 *      depending on c.inclCigar) and print library statistics to std::cout.
 *   5. Write the header and the per-interval, per-sample values to c.outfile.
 *
 * @param c        run configuration (files, int_file, window_size,
 *                 window_offset, window_num, minGenoQual, inclCigar, avg_flag,
 *                 bp_flag, outfile are read here)
 * @param covType  tag object forwarded unchanged to annotateCoverage()
 * @return         0 on success, -1 when a BAM file cannot be opened or lacks
 *                 an index
 *
 * NOTE(review): this excerpt has lost its closing braces and apparently some
 * statements (e.g. nothing visible here appends to svs) — restore them from
 * the original file before relying on this text.
 */
inline int
run(Config const& c, TCoverageType covType)
  // Create library objects
  typedef boost::unordered_map<std::string, LibraryInfo> TLibraryMap;
  typedef boost::unordered_map<std::string, TLibraryMap> TSampleLibrary;
  TSampleLibrary sampleLib;

  // Scan libraries
  for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
    // Get a sample name
    std::string sampleName(c.files[file_c].stem().string());

    // Check that all input bam files exist
    BamTools::BamReader reader;
    if ( ! reader.Open(c.files[file_c].string()) ) {
      std::cerr << "Could not open input bam file: " << c.files[file_c].string() << std::endl;
      return -1;
    // Check that all input bam files are indexed
    if ( !reader.HasIndex() ) {
      std::cerr << "Missing bam index file: " << c.files[file_c].string() << std::endl;
      return -1;

    // Get library parameters and overall maximum insert size
    TLibraryMap libInfo;
    getLibraryParams(c.files[file_c], libInfo, 0, 5);
    sampleLib.insert(std::make_pair(sampleName, libInfo));

  // Get references
  // Only the first input file's reference table is used for all samples.
  BamTools::BamReader readerRef;
  if ( ! readerRef.Open(c.files[0].string()) ) return -1;
  BamTools::RefVector references = readerRef.GetReferenceData();

  // Read all SV intervals
  typedef std::vector<CovRecord> TSVs;
  TSVs svs;
  // Maps the numeric interval id (starting at 1) to its printable name.
  std::map<unsigned int, std::string> idToName;
  unsigned int intervalCount=1;
  if (boost::filesystem::exists(c.int_file) && boost::filesystem::is_regular_file(c.int_file) && boost::filesystem::file_size(c.int_file)) {
    // Reference name -> index into `references`.
    typedef boost::unordered_map<std::string, unsigned int> TMapChr;
    TMapChr mapChr;
    typename BamTools::RefVector::const_iterator itRef = references.begin();
    for(unsigned int i = 0;itRef!=references.end();++itRef, ++i) mapChr[ itRef->RefName ] = i;
    std::ifstream interval_file(c.int_file.string().c_str(), std::ifstream::in);
    if (interval_file.is_open()) {
      while (interval_file.good()) {
	std::string intervalLine;
	getline(interval_file, intervalLine);
	typedef boost::tokenizer< boost::char_separator<char> > Tokenizer;
	boost::char_separator<char> sep(" \t,;");
	Tokenizer tokens(intervalLine, sep);
	Tokenizer::iterator tokIter = tokens.begin();
	if (tokIter!=tokens.end()) {
	  std::string chrName=*tokIter++;
	  // Lines whose first field is not a known reference name are ignored.
	  TMapChr::const_iterator mapChrIt = mapChr.find(chrName);
	  if (mapChrIt != mapChr.end()) {
	    if (tokIter!=tokens.end()) {
	      // NOTE(review): only the start field is guarded by the end() check
	      // above; the end and name fields are dereferenced unchecked, so a
	      // line with fewer than four fields reads past the last token.
	      // Non-numeric start/end fields presumably make lexical_cast throw
	      // — confirm how callers handle that.
	      CovRecord sv;	  
	      sv.chr = mapChrIt->second;
	      sv.svStart = boost::lexical_cast<int32_t>(*tokIter++);
	      // The end coordinate read from the file is stored incremented by one.
	      sv.svEnd = boost::lexical_cast<int32_t>(*tokIter++) + 1;
	      std::string svName = *tokIter;
	      idToName.insert(std::make_pair(intervalCount, svName));
	      sv.id = intervalCount++;
  } else {
    // Create artificial intervals
    typename BamTools::RefVector::const_iterator itRef = references.begin();
    for(int refIndex=0;itRef!=references.end();++itRef, ++refIndex) {
      int32_t pos = 0;
      unsigned int wSize = c.window_size;
      unsigned int wOffset = c.window_offset;
      // A positive window_num overrides the window size so that the reference
      // length is divided into that many parts (plus one to cover remainders).
      if (c.window_num>0) {
	wSize=(itRef->RefLength / c.window_num) + 1;
      while (pos < references[refIndex].RefLength) {
	// window_len holds the window's END coordinate, clamped to the reference length.
	int32_t window_len = pos+wSize;
	if (window_len > references[refIndex].RefLength) window_len = references[refIndex].RefLength;
	CovRecord sv;
	sv.chr = refIndex;
	sv.svStart = pos;
	sv.svEnd = window_len;
	// Artificial windows are named "<chr>:<start>-<end>".
	std::stringstream s; 
	s << references[sv.chr].RefName << ":" << sv.svStart << "-" << sv.svEnd;
	idToName.insert(std::make_pair(intervalCount, s.str()));
	sv.id = intervalCount++;
	pos += wOffset;

  // Output data types
  // Key: (sample name, interval id). Value: a pair of counters, written below
  // under the "_bpcount" (first) and "_readcount" (second) header columns.
  typedef std::pair<std::string, int> TSampleSVPair;
  typedef std::pair<int, int> TBpRead;
  typedef std::map<TSampleSVPair, TBpRead> TCountMap;
  TCountMap countMap;

  // Annotate coverage
  if (c.inclCigar) annotateCoverage(c.files, c.minGenoQual, sampleLib, svs, countMap, BpLevelType<BpLevelCount>(), covType);
  else annotateCoverage(c.files, c.minGenoQual, sampleLib, svs, countMap, BpLevelType<NoBpLevelCount>(), covType);

  // Output library statistics
  std::cout << "Library statistics" << std::endl;
  TSampleLibrary::const_iterator sampleIt=sampleLib.begin();
  for(;sampleIt!=sampleLib.end();++sampleIt) {
    std::cout << "Sample: " << sampleIt->first << std::endl;
    TLibraryMap::const_iterator libIt=sampleIt->second.begin();
    for(;libIt!=sampleIt->second.end();++libIt) {
      std::cout << "RG: ID=" << libIt->first << ",Median=" << libIt->second.median << ",MAD=" << libIt->second.mad << ",Orientation=" << (int) libIt->second.defaultOrient << std::endl;

  // Output file
  boost::iostreams::filtering_ostream dataOut;
  dataOut.push(boost::iostreams::file_sink(c.outfile.string().c_str(), std::ios_base::out | std::ios_base::binary));

  // Print header
  // Column layout per sample mirrors the data rows: optional average, optional
  // base-pair count, then the read count (plain sample name if neither flag is set).
  dataOut << "#chr\tstart\tend\tid";
  for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
    std::string sampleName(c.files[file_c].stem().string());
    dataOut << "\t";
    if (c.avg_flag) dataOut << sampleName << "_avgcov" << "\t";
    if (c.bp_flag) dataOut << sampleName << "_bpcount" << "\t";
    if ((c.bp_flag) || (c.avg_flag)) dataOut << sampleName << "_readcount";
    else dataOut << sampleName;
  dataOut << std::endl;

  // Iterate all SVs
  typename TSVs::const_iterator itSV = svs.begin();
  typename TSVs::const_iterator itSVEnd = svs.end();
  for(;itSV!=itSVEnd;++itSV) {
    // NOTE(review): the find() result is dereferenced without an end() check.
    dataOut << references[itSV->chr].RefName << "\t" << itSV->svStart << "\t" << itSV->svEnd << "\t" << idToName.find(itSV->id)->second;
    // Iterate all samples
    for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
      // Get the sample name
      std::string sampleName(c.files[file_c].stem().string());
      TSampleSVPair sampleSVPair = std::make_pair(sampleName, itSV->id);
      // NOTE(review): countMapIt is dereferenced below without an end() check;
      // this relies on annotateCoverage() having created an entry for every
      // (sample, interval) pair — confirm.
      typename TCountMap::iterator countMapIt=countMap.find(sampleSVPair);
      dataOut << "\t";
      // Average = first counter divided by the interval length (svEnd - svStart).
      // NOTE(review): the divisor is zero if an interval has svEnd == svStart.
      if (c.avg_flag) dataOut << ( (countMapIt->second.first) / (double) (itSV->svEnd - itSV->svStart)) << "\t";
      if (c.bp_flag) dataOut << countMapIt->second.first << "\t";
      dataOut << countMapIt->second.second;
    dataOut << std::endl;

  // End
  boost::posix_time::ptime now = boost::posix_time::second_clock::local_time();
  std::cout << '[' << boost::posix_time::to_simple_string(now) << "] Done." << std::endl;;
  return 0;