Beispiel #1
0
std::string getQuickStats(const std::string &bamFile, std::map< std::string, int > &keyLen, unsigned int &nFlowFZ, unsigned int &nFlowZM) {
	std::string errMsg = "";
	BamTools::BamReader bamReader;
	if(!bamReader.Open(bamFile)) {
		errMsg += "Failed to open bam " + bamFile + "\n";
		return(errMsg);
	}
	BamTools::SamHeader samHeader = bamReader.GetHeader();
	for (BamTools::SamReadGroupIterator itr = samHeader.ReadGroups.Begin(); itr != samHeader.ReadGroups.End(); ++itr ) {
		if(itr->HasID())
			keyLen[itr->ID] = itr->HasKeySequence() ? itr->KeySequence.length() : 0;
		if(itr->HasFlowOrder())
			nFlowZM = std::max(nFlowZM,(unsigned int) itr->FlowOrder.length());
	}
	BamTools::BamAlignment alignment;
	std::vector<uint16_t> flowIntFZ;
	while(bamReader.GetNextAlignment(alignment)) {
		if(alignment.GetTag("FZ", flowIntFZ))
			nFlowFZ = flowIntFZ.size();
		break;
	}
	bamReader.Close();
	if(nFlowFZ==0)
		std::cout << "NOTE: bam file has no flow signals in FZ tag: " + bamFile + "\n";
	if(nFlowZM==0)
		std::cout << "NOTE: bam file has no flow signals in ZM tag: " + bamFile + "\n";
	return(errMsg);
}
Beispiel #2
0
int bamCheck(std::string bamFile, BamTools::BamReader & reader)
{
    if (!reader.Open(bamFile)){ //bam file
        std::cerr << "Could not open input BAM file." << std::endl;
        reader.Close();
        return false;
    } else {
        time_t now = time(0);
        std::cerr << "*******************************\n" << ctime(&now) << "************Entropy************\n" << "Opening " << bamFile <<std::endl;
        std::cerr << "===============================" << std::endl;
        return 0;
    }
    
}
	position BamAlignmentReader::GetLastPositionInBam(const std::string& bamPath, Region::SharedPtr regionPtr)
	{
		BamTools::BamReader bamReader;
		if (!bamReader.Open(bamPath))
		{
			throw "Unable to open bam file";
		}

		bamReader.LocateIndex();
		int refID = bamReader.GetReferenceID(regionPtr->getReferenceID());
		auto referenceData = bamReader.GetReferenceData();
		bamReader.Close();
		return referenceData[refID].RefLength;
	}
Beispiel #4
0
Batch::RunStatus PairedEndBatch::parseAlignmentFile(void) {

    // open reader on new BAM alignment file
    BamTools::BamReader reader;
    if ( !reader.Open(m_generatedBam) ) {
        m_errorString = "could not open generated BAM file: ";
        m_errorString.append(m_generatedBam);
        m_errorString.append(" to parse alignments");
        return Batch::Error;
    }

    // set up data containers
    m_result.ReadLengths.reserve(2 * m_settings->BatchSize);
    m_result.FragmentLengths.reserve(m_settings->BatchSize);

    // plow through alignments
    BamTools::BamAlignment mate1;
    BamTools::BamAlignment mate2;
    while ( reader.GetNextAlignmentCore(mate1) ) {

        // store mate1 read length, regardless of aligned state
        m_result.ReadLengths.push_back(mate1.Length);

        // read mate2
        if ( reader.GetNextAlignmentCore(mate2) ) {

            // store mate2 read length, regardless of aligned state
            m_result.ReadLengths.push_back(mate2.Length);

            // if both mates mapped to same reference
            if ( mate1.IsMapped() &&
                 mate2.IsMapped() &&
                 (mate1.RefID == mate2.RefID) )
            {
                // calculate & store fragment length
                m_result.FragmentLengths.push_back( calculateFragmentLength(mate1, mate2) );
            }
        }
    }
    reader.Close();

    // remove extreme outliers
    removeOutliers(m_result.FragmentLengths);
    removeOutliers(m_result.ReadLengths);

    // if we get here, all should be OK
    return Batch::Normal;
}
	std::vector< Sample::SharedPtr > BamAlignmentReader::GetBamReaderSamples(const std::string& bamPath)
	{
		std::vector< Sample::SharedPtr > samplePtrs;
		BamTools::BamReader bamReader;
		if (!bamReader.Open(bamPath))
		{
			throw "Unable to open bam file";
		}
		auto readGroups = bamReader.GetHeader().ReadGroups;
		auto iter = readGroups.Begin();
		for (; iter != readGroups.End(); ++iter)
		{
			auto samplePtr = std::make_shared< Sample >((*iter).Sample, (*iter).ID, bamPath);
			samplePtrs.emplace_back(samplePtr);
		}
		bamReader.Close();
		return samplePtrs;
	}
	uint32_t BamAlignmentReader::GetReadLength(const std::string& bamPath)
	{
		uint32_t bamReadLength = 300;
		BamTools::BamReader bamReader;
		if (!bamReader.Open(bamPath))
		{
			throw "Unable to open bam file";
		}
		BamTools::BamAlignment bamAlignment;
		while(bamReader.GetNextAlignment(bamAlignment))
		{
			if (bamAlignment.IsPrimaryAlignment())
			{
				bamReadLength = bamAlignment.QueryBases.size();
				break;
			}
		}
		bamReader.Close();
		return bamReadLength;
	}
Beispiel #7
0
void Config::InitializationClustering() {
    struct stat st;
    if(stat(Workspace.c_str(),&st) == 0 and st.st_mode and S_IFDIR != 0) Log("[Warning] Workspace directory already present");
    else if (mkdir(Workspace.c_str(), 0755) != 0) {
        Log("[Error] Could not create workspace directory: " + Workspace);
        exit(1);
    }
    RunningTasksFile = Workspace + "/" + FilePrefix + "running.tasks";
    StatsFile = Workspace + "/" + FilePrefix + "stats";
    BinClusterFile = Workspace + "/" + FilePrefix + "bpc";
    clusterFile = new ClusterFile(BinClusterFile);
    clusterDir = Workspace + "/clusters/";
    if(stat(clusterDir.c_str(),&st) == 0 and st.st_mode and S_IFDIR != 0) Log("[Warning] Cluster directory already present");
    else if (mkdir(clusterDir.c_str(), 0755) != 0) {
        Log("[Error] Could not create cluster directory: " + clusterDir);
        exit(1);
    }
    insertsizeDir = Workspace + "/insertsize/";
    if(stat(insertsizeDir.c_str(),&st) == 0 and st.st_mode and S_IFDIR != 0) Log("[Warning] Insertsize directory already present");
    else if (mkdir(insertsizeDir.c_str(), 0755) != 0) {
        Log("[Error] Could not create insertsize directory: " + insertsizeDir);
        exit(1);
    }
    coverageDir = Workspace + "/coverage/";
    if(stat(coverageDir.c_str(),&st) == 0 and st.st_mode and S_IFDIR != 0) Log("[Warning] Coverage directory already present");
    else if (mkdir(coverageDir.c_str(), 0755) != 0) {
        Log("[Error] Could not create coverage directory: " + coverageDir);
        exit(1);
    }
    
    if (!ForwardBam.empty() && !ReverseBam.empty() && PairedBam.empty()) {
        UsePairedBam = false;
    } else if (ForwardBam.empty() && ReverseBam.empty() && !PairedBam.empty()) {
        UsePairedBam = true;
    } else {
        Log("[Error] No correct bam file(s)");
        exit(1);
    }
    
    BamTools::BamAlignment alignment;
    BamTools::BamReader BamReader;
    
    if (UsePairedBam) {
        BamReader.Open(PairedBam);
        if (not BamReader.IsOpen()) {
            Log("[Error] Could not open paired bam");
            exit(1);
        }
        if (PairedIndex.empty()) {
            if (not BamReader.LocateIndex(BamTools::BamIndex::STANDARD)) {
                PairedIndex = PairedBam.substr(0,PairedBam.find_last_of(".bam")-3) + ".bai";
                BamReader.OpenIndex(PairedIndex);
            }
            if (not BamReader.HasIndex()) {
                Log("[Error] No index for bamfile");
                exit(1);
            }
        }
        BamTools::SamHeader header = BamReader.GetHeader();
        for (BamTools::SamReadGroupIterator it = header.ReadGroups.Begin(); it != header.ReadGroups.End(); it++) {
            BamTools::SamReadGroup* readgroup = &*it;
            readNameConverter.TrimName(readgroup->ID);
            readNameConverter.AddReadGroup(readgroup->ID);
        }
        long int count = 0;
        while (BamReader.GetNextAlignment(alignment)) {
            string RG;
            if (alignment.GetTag("RG", RG)) {
                if (not NameTrim.empty()) readNameConverter.TrimName(RG);
                if (readNameConverter.AddReadGroup(RG)) {
                    Log("[Warning] Readgroup '" + RG + "' found in reads but not in header");
                    count = 0;
                }
            }
            count++;
            if (count > 10000) break;
        }
        BamReader.Close();
    } else {
        BamReader.Open(ForwardBam);
        if (not BamReader.IsOpen()) {
            Log("[Error] Could not open first/forward bam");
            exit(1);
        }
        if (ForwardIndex.empty()) {
            if (not BamReader.LocateIndex(BamTools::BamIndex::STANDARD)) {
                ForwardIndex = ForwardBam.substr(0,ForwardBam.find_last_of(".bam")-3) + ".bai";
                BamReader.OpenIndex(ForwardIndex);
            }
            if (not BamReader.HasIndex()) {
                Log("[Error] No index for forward bamfile");
                exit(1);
            }
        }
        BamTools::SamHeader forwardheader = BamReader.GetHeader();
        for (BamTools::SamReadGroupIterator it = forwardheader.ReadGroups.Begin(); it != forwardheader.ReadGroups.End(); it++) {
            BamTools::SamReadGroup* readgroup = &*it;
            readNameConverter.TrimName(readgroup->ID);
            readNameConverter.AddReadGroup(readgroup->ID);
        }
        long int count = 0;
        while (BamReader.GetNextAlignment(alignment)) {
            string RG;
            if (alignment.GetTag("RG", RG)) {
                if (!NameTrim.empty()) readNameConverter.TrimName(RG);
                if (readNameConverter.AddReadGroup(RG)) {
                    Log("[Warning] Readgroup '" + RG + "' found in forward reads but not in header");
                    count = 0;
                }
            }
            count++;
            if (count > 10000) break;
        }
        BamReader.Close();
        BamReader.Open(ReverseBam);
        if (not BamReader.IsOpen()) {
            Log("[Error] Could not open second/reverse bam");
            exit(1);
        }
        if (ReverseIndex.empty()) {
            if (not BamReader.LocateIndex(BamTools::BamIndex::STANDARD)) {
                ReverseIndex = ReverseBam.substr(0,ReverseBam.find_last_of(".bam")-3) + ".bai";
                BamReader.OpenIndex(ReverseIndex);
            }
            if (not BamReader.HasIndex()) {
                Log("[Error] No index for reverse bamfile");
                exit(1);
            }
        }
        BamTools::SamHeader reverseheader = BamReader.GetHeader();
        for (BamTools::SamReadGroupIterator it = reverseheader.ReadGroups.Begin(); it != reverseheader.ReadGroups.End(); it++) {
            BamTools::SamReadGroup* readgroup = &*it;
            readNameConverter.TrimName(readgroup->ID);
            if (readNameConverter.AddReadGroup(readgroup->ID)) {
                Log("[Warning] Readgroup '" + readgroup->ID + "' found in reverse but not in forward");
            }
        }
        count = 0;
        while (BamReader.GetNextAlignment(alignment)) {
            string RG;
            if (alignment.GetTag("RG", RG)) {
                if (!NameTrim.empty()) readNameConverter.TrimName(RG);
                if (readNameConverter.AddReadGroup(RG)) {
                    Log("[Warning] Readgroup '" + RG + "' found in reverse reads but not in header");
                    count = 0;
                } 
            }
            count++;
            if (count > 10000) break;
        }
        BamReader.Close();
    }
    
    for(map<string, int>::iterator it = readNameConverter.ReadGroups.begin(); it!=readNameConverter.ReadGroups.end(); ++it) {
        ostringstream logBuffer;
        logBuffer << "Readgroup found: " << it->second << " - " << it->first;
        Log(logBuffer.str());
    }
    
    writeConfigFile(Workspace + FilePrefix + "config");
}
Beispiel #8
0
int scanBam()
{

	std::vector< std::map<std::string, ScanResults> > resultlist;
//	std::ostream* pWriter;
//	pWriter = &std::cout;
    bool isExome = opt::exomebedfile.size()==0? false: true;

    std::cout << opt::bamlist << "\n";
    std::cout << opt::bamlist.size() << " BAMs" <<  std::endl;

    for(std::size_t i=0; i<opt::bamlist.size(); i++) {

        // storing results for each read group (RG tag). use
        // read group ID as key.
        std::map<std::string, ScanResults> resultmap;
        // store where the overlap was last found in the case of exome seq
    	std::map<std::string, std::vector<range>::iterator> lastfound;
    	std::vector<range>::iterator searchhint;
        
        std::cerr << "Start analysing BAM " << opt::bamlist[i] << "\n";

        // Open the bam files for reading/writing
        BamTools::BamReader* pBamReader = new BamTools::BamReader;

        pBamReader->Open(opt::bamlist[i]);

        // get bam headers
        const BamTools::SamHeader header = pBamReader ->GetHeader();

//        for(BamTools::SamSequenceConstIterator it = header.Sequences.Begin();
//        						it != header.Sequences.End();++it){
//        	std::cout << "Assembly ID:" << it->AssemblyID << ", Name:" << it->Name << std::endl;
//        }
//        exit(0);

        bool rggroups=false;
        
        if(opt::ignorerg){ // ignore read groups
        	std::cerr << "Treat all reads in BAM as if they were from a same sample" << std::endl;
        	ScanResults results;
        	results.sample = opt::unknown;
        	resultmap[opt::unknown]=results;
        }else{
			std::map <std::string, std::string> readgroups;
			std::map <std::string, std::string> readlibs;

			rggroups = header.HasReadGroups();

			if(rggroups){
				for(BamTools::SamReadGroupConstIterator it = header.ReadGroups.Begin();
						it != header.ReadGroups.End();++it){
					readgroups[it->ID]= it->Sample;
					if(it->HasLibrary()){
						readlibs[it->ID] = it -> Library;
					}else{
						readlibs[it->ID] = opt::unknown;
					}
				}
				std::cerr<<"Specified BAM has "<< readgroups.size()<< " read groups" << std::endl;

				for(std::map<std::string, std::string>::iterator it = readgroups.begin(); it != readgroups.end(); ++it){
					ScanResults results;
					std::string rgid = it -> first;
					results.sample = it -> second;
					results.lib = readlibs[rgid];
					resultmap[rgid]=results; //results are identified by RG tag.
				}

			}else{
				std::cerr << "Warning: can't find RG tag in the BAM header" << std::endl;
				std::cerr << "Warning: treat all reads in BAM as if they were from a same sample" << std::endl;
				ScanResults results;
				results.sample = opt::unknown;
				results.lib = opt::unknown;
				resultmap[opt::unknown]=results;
			}
        }

        BamTools::BamAlignment record1;
        bool done = false;
        
        int nprocessed=0; // number of reads analyzed
        int ntotal=0; // number of reads scanned in bam (we skip some reads, see below)
        while(!done)
        {
            ntotal ++;
            done = !pBamReader -> GetNextAlignment(record1);
            std::string tag = opt::unknown;
            if(rggroups){
                
                // skip reads that do not have read group tag
            	if(record1.HasTag("RG")){
					record1.GetTag("RG", tag);
	//            	std::cerr << c << " reads:{" << record1.QueryBases << "} tag:{" << tag << "}\n";
				}else{
					std::cerr << "can't find RG tag for read at position {" << record1.RefID << ":" << record1.Position << "}" << std::endl;
					std::cerr << "skip this read" << std::endl;
					continue;
				}
            }
            
            // skip reads with readgroup not defined in BAM header
            if(resultmap.find(tag) == resultmap.end()){
				std::cerr << "RG tag {" << tag << "} for read at position ";
				std::cerr << "{" << record1.RefID << ":" << record1.Position << "} doesn't exist in BAM header.";
				continue;
            }

            // for exome, exclude reads mapped to the exome regions.
            if(isExome){
				range rg;
				rg.first = record1.Position;
				rg.second = record1.Position + record1.Length;
				std::string chrm =  refID2Name(record1.RefID);

				if(chrm != "-1"){ // check if overlap exome when the read is mapped to chr1-22, X, Y
					// std::cerr << "read: " << chrm << " " << rg << "\n" << std::endl;
					std::map<std::string, std::vector<range> >::iterator chrmit = opt::exomebed.find(chrm);
					if(chrmit == opt::exomebed.end())
					{
						// std::cerr<<"chromosome or reference sequence: " << chrm << " is not present in the specified exome bed file." <<std::endl;
						// std::cerr<<"please check sequence name encoding, i.e. for chromosome one, is it chr1 or 1" << std::endl;
                        // unmapped reads can have chr names as a star (*). We also don't consider MT reads. 
						resultmap[tag].n_exreadsChrUnmatched +=1; 
					}else{
						std::vector<range>::iterator itend = opt::exomebed[chrm].end();
						std::map<std::string, std::vector<range>::iterator>::iterator lastfoundchrmit = lastfound.find(chrm);
						if(lastfoundchrmit == lastfound.end()){ // first entry to this chrm
							lastfound[chrm] = chrmit->second.begin();// start from begining
						}

						// set the hint to where the previous found is
						searchhint = lastfound[chrm];
						std::vector<range>::iterator itsearch = searchRange(searchhint, itend, rg);
						// if found
						if(itsearch != itend){// if found
							searchhint = itsearch;
							resultmap[tag].n_exreadsExcluded +=1;
							lastfound[chrm] = searchhint; // update search hint
							continue;
						}
					}

				}
            }

            resultmap[tag].numTotal +=1;

            if(record1.IsMapped())
            {
            	resultmap[tag].numMapped += 1;
            }
            if(record1.IsDuplicate()){
            	resultmap[tag].numDuplicates +=1;
            }

            double gc = calcGC(record1.QueryBases);
            int ptn_count = countMotif(record1.QueryBases, opt::PATTERN, opt::PATTERN_REV);
            // when the read length exceeds 100bp, number of patterns might exceed the boundary
            if (ptn_count > ScanParameters::TEL_MOTIF_N-1){
                continue;
            }
            resultmap[tag].telcounts[ptn_count]+=1;

            if(gc >= ScanParameters::GC_LOWERBOUND && gc <= ScanParameters::GC_UPPERBOUND){
            	// get index for GC bin.
            	int idx = floor((gc-ScanParameters::GC_LOWERBOUND)/ScanParameters::GC_BINSIZE);
            	assert(idx >=0 && idx <= ScanParameters::GC_BIN_N-1);
//            	std::cerr << c << " GC:{"<< gc << "} telcounts:{"<< ptn_count <<"} GC idx{" << idx << "}\n";
            	if(idx > ScanParameters::GC_BIN_N-1){
            		std::cerr << nprocessed << " GC:{"<< gc << "} telcounts:{"<< ptn_count <<"} GC bin index out of bound:" << idx << "\n";
            		exit(EXIT_FAILURE);
            	}
            	resultmap[tag].gccounts[idx]+=1;
            }

            // if(resultmap[tag].n_exreadsChrUnmatched > 1000){
            // 	std::cerr<<"too many reads found with unmatched chromosome ID between BAM and exome BED. \n" << std::endl;
            // }

            nprocessed++;

            if( nprocessed%10000000 == 0){
            	std::cerr << "[scan] processed " << nprocessed << " reads \n" ;
            }
        }
        
		pBamReader->Close();
        delete pBamReader;
        
        // consider each BAM separately
        resultlist.push_back(resultmap);

        std::cerr << "[scan] total reads in BAM scanned " << ntotal << std::endl;
        std::cerr << "Completed scanning BAM\n";

    }

    if(opt::onebam){
        merge_results_by_readgroup(resultlist);
    }
    
    outputresults(resultlist);

    if(isExome){
    	printlog(resultlist);
    }
    
    std::cerr << "Completed writing results\n";

    return 0;
}
Beispiel #9
0
inline int
run(Config const& c, TSingleHit)
{
  // Create library objects
  typedef std::map<std::string, LibraryInfo> TLibraryMap;
  typedef std::map<std::string, TLibraryMap> TSampleLibrary;
  TSampleLibrary sampleLib;

  // Scan libraries
  for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
    // Get a sample name
    std::string sampleName(c.files[file_c].stem().string());

    // Check that all input bam files exist
    BamTools::BamReader reader;
    if ( ! reader.Open(c.files[file_c].string()) ) {
      std::cerr << "Could not open input bam file: " << c.files[file_c].string() << std::endl;
      reader.Close();
      return -1;
    }
    
    // Check that all input bam files are indexed
    reader.LocateIndex();
    if ( !reader.HasIndex() ) {
      std::cerr << "Missing bam index file: " << c.files[file_c].string() << std::endl;
      reader.Close();
      return -1;
    }

    // Get library parameters and overall maximum insert size
    TLibraryMap libInfo;
    getLibraryParams(c.files[file_c], libInfo, 0, 5);
    sampleLib.insert(std::make_pair(sampleName, libInfo));
  }

  // Read all SV intervals
  typedef std::vector<StructuralVariantRecord> TSVs;
  TSVs svs;
  std::map<unsigned int, std::string> idToName;
  unsigned int intervalCount=1;
  if (boost::filesystem::exists(c.int_file) && boost::filesystem::is_regular_file(c.int_file) && boost::filesystem::file_size(c.int_file)) {
    Memory_mapped_file interval_file(c.int_file.string().c_str());
    char interval_buffer[Memory_mapped_file::MAX_LINE_LENGTH];
    while (interval_file.left_bytes() > 0) {
      interval_file.read_line(interval_buffer);
      // Read single interval line
      StructuralVariantRecord sv;
      Tokenizer token(interval_buffer, Memory_mapped_file::MAX_LINE_LENGTH);
      std::string interval_rname;
      token.getString(sv.chr);
      sv.svStart = token.getUInt();
      sv.svEnd = token.getUInt() + 1;
      std::string svName;
      token.getString(svName);
      idToName.insert(std::make_pair(intervalCount, svName));
      sv.id = intervalCount++;
      svs.push_back(sv);
    }
    interval_file.close();
  } else {
    // Create artificial intervals
    BamTools::BamReader readerRef;
    if ( ! readerRef.Open(c.files[0].string()) ) return -1;
    BamTools::RefVector references = readerRef.GetReferenceData();
    typename BamTools::RefVector::const_iterator itRef = references.begin();
    for(int refIndex=0;itRef!=references.end();++itRef, ++refIndex) {
      int32_t pos = 0;
      while (pos < references[refIndex].RefLength) {
	int32_t window_len = pos+c.window_size;
	if (window_len > references[refIndex].RefLength) window_len = references[refIndex].RefLength;
	StructuralVariantRecord sv;
	sv.chr = references[refIndex].RefName;
	sv.svStart = pos;
	sv.svEnd = window_len;
	std::stringstream s; 
	s << sv.chr << ":" << sv.svStart << "-" << sv.svEnd;
	idToName.insert(std::make_pair(intervalCount, s.str()));
	sv.id = intervalCount++;
	svs.push_back(sv);
	pos += c.window_offset;
      }
    }
  }

  // Output data types
  typedef std::pair<std::string, int> TSampleSVPair;
  typedef std::pair<int, int> TBpRead;
  typedef std::map<TSampleSVPair, TBpRead> TCountMap;
  TCountMap countMap;

  // Annotate coverage
  annotateCoverage(c.files, c.minMapQual, c.inclCigar, sampleLib, svs, countMap, TSingleHit());

  // Output library statistics
  std::cout << "Library statistics" << std::endl;
  TSampleLibrary::const_iterator sampleIt=sampleLib.begin();
  for(;sampleIt!=sampleLib.end();++sampleIt) {
    std::cout << "Sample: " << sampleIt->first << std::endl;
    TLibraryMap::const_iterator libIt=sampleIt->second.begin();
    for(;libIt!=sampleIt->second.end();++libIt) {
      std::cout << "RG: ID=" << libIt->first << ",Median=" << libIt->second.median << ",MAD=" << libIt->second.mad << ",Orientation=" << (int) libIt->second.defaultOrient << ",MappedReads=" << libIt->second.mappedReads << ",DuplicatePairs=" << libIt->second.non_unique_pairs << ",UniquePairs=" << libIt->second.unique_pairs << std::endl;
    }
  }

  // Output file
  boost::iostreams::filtering_ostream dataOut;
  dataOut.push(boost::iostreams::gzip_compressor());
  dataOut.push(boost::iostreams::file_sink(c.outfile.string().c_str(), std::ios_base::out | std::ios_base::binary));

  // Iterate all SVs
  typename TSVs::const_iterator itSV = svs.begin();
  typename TSVs::const_iterator itSVEnd = svs.end();
  for(;itSV!=itSVEnd;++itSV) {
    dataOut << itSV->chr << "\t" << itSV->svStart << "\t" << itSV->svEnd << "\t" << idToName.find(itSV->id)->second;
    // Iterate all samples
    for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
      // Get the sample name
      std::string sampleName(c.files[file_c].stem().string());
      TSampleSVPair sampleSVPair = std::make_pair(sampleName, itSV->id);
      typename TCountMap::iterator countMapIt=countMap.find(sampleSVPair);
      dataOut << "\t";
      if (c.avg_flag) dataOut << ( (countMapIt->second.first) / (double) (itSV->svEnd - itSV->svStart)) << "\t";
      if (c.bp_flag) dataOut << countMapIt->second.first << "\t";
      dataOut << countMapIt->second.second;
    }
    dataOut << std::endl;
  }

  // End
  boost::posix_time::ptime now = boost::posix_time::second_clock::local_time();
  std::cout << '[' << boost::posix_time::to_simple_string(now) << "] Done." << std::endl;;
  return 0;
}
Beispiel #10
0
//
// Main
//
int filterBAMMain(int argc, char** argv)
{
    parseFilterBAMOptions(argc, argv);

    // Read the graph if distance-filtering mode is enabled
    StringGraph* pGraph = NULL;
    if(!opt::asqgFile.empty())
        pGraph = SGUtil::loadASQG(opt::asqgFile, 0, false);

    // Read the BWTs if depth-filtering mode is enabled
    BWT* pBWT = NULL;
    BWT* pRBWT = NULL;
    if(!opt::fmIndexPrefix.empty())
    {
        pBWT = new BWT(opt::fmIndexPrefix + BWT_EXT, opt::sampleRate);
        pRBWT = new BWT(opt::fmIndexPrefix + RBWT_EXT, opt::sampleRate);
    }

    Timer* pTimer = new Timer(PROGRAM_IDENT);    

    // 
    int numPairsTotal = 0;
    int numPairsFilteredByDistance = 0;
    int numPairsFilteredByER = 0;
    int numPairsFilteredByQuality = 0;
    int numPairsFilteredByDepth = 0;
    int numPairsUnmapped = 0;
    int numPairsWrote = 0;

    // Open the bam files for reading/writing
    BamTools::BamReader* pBamReader = new BamTools::BamReader;
    pBamReader->Open(opt::bamFile);

    BamTools::BamWriter* pBamWriter = new BamTools::BamWriter;
    pBamWriter->Open(opt::outFile, pBamReader->GetHeaderText(), pBamReader->GetReferenceData());
    const BamTools::RefVector& referenceVector = pBamReader->GetReferenceData();


    BamTools::BamAlignment record1;
    BamTools::BamAlignment record2;
    bool done = false;

    while(!done)
    {
        if(numPairsTotal++ % 200000 == 0)
            printf("[sga filterBAM] Processed %d pairs\n", numPairsTotal);

        done = !readAlignmentPair(pBamReader, record1, record2);
        if(done)
            break;

        if(!record1.IsMapped() || !record2.IsMapped())
        {
            numPairsUnmapped += 1;
            continue;
        }

        // Ensure the pairing is correct
        if(record1.Name != record2.Name)
        {
            std::cout << "NAME FAIL: " << record1.Name << " " << record2.Name << "\n";
        }
        assert(record1.Name == record2.Name);
        bool bPassedFilters = true;

        // Check if the error rate is below the max
        double er1 = getErrorRate(record1);
        double er2 = getErrorRate(record2);

        if(er1 > opt::maxError || er2 > opt::maxError)
        {
            bPassedFilters = false;
            numPairsFilteredByER += 1;
        }

        if(record1.MapQuality < opt::minQuality || record2.MapQuality < opt::minQuality)
        {
            bPassedFilters = false;
            numPairsFilteredByQuality += 1;
        }

        // Perform depth check for pairs aligning to different contigs
        if(bPassedFilters && (pBWT != NULL && pRBWT != NULL && opt::maxKmerDepth > 0) && (record1.RefID != record2.RefID))
        {
            int maxDepth1 = getMaxKmerDepth(record1.QueryBases, pBWT, pRBWT);
            int maxDepth2 = getMaxKmerDepth(record1.QueryBases, pBWT, pRBWT);
            if(maxDepth1 > opt::maxKmerDepth || maxDepth2 > opt::maxKmerDepth)
            {
                bPassedFilters = false;
                numPairsFilteredByDepth += 1;
            }
        }

        // Perform short-insert pair check
        if(pGraph != NULL)
        {
            bPassedFilters = bPassedFilters && filterByGraph(pGraph, referenceVector, record1, record2);
            numPairsFilteredByDistance += 1;
        }
        if(bPassedFilters)
        {
            pBamWriter->SaveAlignment(record1);
            pBamWriter->SaveAlignment(record2);
            numPairsWrote += 1;
        }
    }

    std::cout << "Total pairs: " << numPairsTotal << "\n";
    std::cout << "Total pairs output: " << numPairsWrote << "\n";
    std::cout << "Total filtered because one pair is unmapped: " << numPairsUnmapped << "\n";
    std::cout << "Total filtered by distance: " << numPairsFilteredByDistance << "\n";
    std::cout << "Total filtered by error rate: " << numPairsFilteredByER << "\n";
    std::cout << "Total filtered by quality: " << numPairsFilteredByQuality << "\n";
    std::cout << "Total filtered by depth: " << numPairsFilteredByDepth << "\n";
    
    if(pGraph != NULL)
        delete pGraph;

    if(pBWT != NULL)
        delete pBWT;

    if(pRBWT != NULL)
        delete pRBWT;

    pBamWriter->Close();
    pBamReader->Close();

    delete pTimer;
    delete pBamReader;
    delete pBamWriter;
    return 0;
}
Beispiel #11
0
void MyBamGroup::ReadGroup(char *bamFile){

	BamTools::BamReader bamReader;
	if(!bamReader.Open(std::string(bamFile))) {
		 errMsg = "Failed to open bam " + std::string(bamFile) + "\n";
	} else {
		BamTools::SamHeader samHeader = bamReader.GetHeader();
		for (BamTools::SamReadGroupIterator itr = samHeader.ReadGroups.Begin(); itr != samHeader.ReadGroups.End(); ++itr ) {
			if(itr->HasID()) {
				ID.push_back(itr->ID);
			} else {
				ID.push_back("");
			}
			if(itr->HasFlowOrder()) {
				FlowOrder.push_back(itr->FlowOrder);
			} else {
				FlowOrder.push_back("");
			}
			if(itr->HasKeySequence()) {
				KeySequence.push_back(itr->KeySequence);
			} else {
				KeySequence.push_back("");
			}
			if(itr->HasDescription()) {
				Description.push_back(itr->Description);
			} else {
				Description.push_back("");
			}
			if(itr->HasLibrary()) {
				Library.push_back(itr->Library);
			} else {
				Library.push_back("");
			}
			if(itr->HasPlatformUnit()) {
				PlatformUnit.push_back(itr->PlatformUnit);
			} else {
				PlatformUnit.push_back("");
			}
			if(itr->HasPredictedInsertSize()) {
				PredictedInsertSize.push_back(itr->PredictedInsertSize);
			} else {
				PredictedInsertSize.push_back("");
			}
			if(itr->HasProductionDate()) {
				ProductionDate.push_back(itr->ProductionDate);
			} else {
				ProductionDate.push_back("");
			}
			if(itr->HasProgram()) {
				Program.push_back(itr->Program);
			} else {
				Program.push_back("");
			}
			if(itr->HasSample()) {
				Sample.push_back(itr->Sample);
			} else {
				Sample.push_back("");
			}
			if(itr->HasSequencingCenter()) {
				SequencingCenter.push_back(itr->SequencingCenter);
			} else {
				SequencingCenter.push_back("");
			}
			if(itr->HasSequencingTechnology()) {
				SequencingTechnology.push_back(itr->SequencingTechnology);
			} else {
				SequencingTechnology.push_back("");
			}
		}
		bamReader.Close();
	}

}
Beispiel #12
0
inline int
run(Config const& c, TCoverageType covType)
{
  // Create library objects
  typedef boost::unordered_map<std::string, LibraryInfo> TLibraryMap;
  typedef boost::unordered_map<std::string, TLibraryMap> TSampleLibrary;
  TSampleLibrary sampleLib;

  // Scan libraries
  for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
    // Get a sample name
    std::string sampleName(c.files[file_c].stem().string());

    // Check that all input bam files exist
    BamTools::BamReader reader;
    if ( ! reader.Open(c.files[file_c].string()) ) {
      std::cerr << "Could not open input bam file: " << c.files[file_c].string() << std::endl;
      reader.Close();
      return -1;
    }
    
    // Check that all input bam files are indexed
    reader.LocateIndex();
    if ( !reader.HasIndex() ) {
      std::cerr << "Missing bam index file: " << c.files[file_c].string() << std::endl;
      reader.Close();
      return -1;
    }

    // Get library parameters and overall maximum insert size
    TLibraryMap libInfo;
    getLibraryParams(c.files[file_c], libInfo, 0, 5);
    sampleLib.insert(std::make_pair(sampleName, libInfo));
  }

  // Get references
  BamTools::BamReader readerRef;
  if ( ! readerRef.Open(c.files[0].string()) ) return -1;
  BamTools::RefVector references = readerRef.GetReferenceData();

  // Read all SV intervals
  typedef std::vector<CovRecord> TSVs;
  TSVs svs;
  std::map<unsigned int, std::string> idToName;
  unsigned int intervalCount=1;
  if (boost::filesystem::exists(c.int_file) && boost::filesystem::is_regular_file(c.int_file) && boost::filesystem::file_size(c.int_file)) {
    typedef boost::unordered_map<std::string, unsigned int> TMapChr;
    TMapChr mapChr;
    typename BamTools::RefVector::const_iterator itRef = references.begin();
    for(unsigned int i = 0;itRef!=references.end();++itRef, ++i) mapChr[ itRef->RefName ] = i;
    std::ifstream interval_file(c.int_file.string().c_str(), std::ifstream::in);
    if (interval_file.is_open()) {
      while (interval_file.good()) {
	std::string intervalLine;
	getline(interval_file, intervalLine);
	typedef boost::tokenizer< boost::char_separator<char> > Tokenizer;
	boost::char_separator<char> sep(" \t,;");
	Tokenizer tokens(intervalLine, sep);
	Tokenizer::iterator tokIter = tokens.begin();
	if (tokIter!=tokens.end()) {
	  std::string chrName=*tokIter++;
	  TMapChr::const_iterator mapChrIt = mapChr.find(chrName);
	  if (mapChrIt != mapChr.end()) {
	    if (tokIter!=tokens.end()) {
	      CovRecord sv;	  
	      sv.chr = mapChrIt->second;
	      sv.svStart = boost::lexical_cast<int32_t>(*tokIter++);
	      sv.svEnd = boost::lexical_cast<int32_t>(*tokIter++) + 1;
	      std::string svName = *tokIter;
	      idToName.insert(std::make_pair(intervalCount, svName));
	      sv.id = intervalCount++;
	      svs.push_back(sv);
	    }
	  }
	}
      }
      interval_file.close();
    }
  } else {
    // Create artificial intervals
    typename BamTools::RefVector::const_iterator itRef = references.begin();
    for(int refIndex=0;itRef!=references.end();++itRef, ++refIndex) {
      int32_t pos = 0;
      unsigned int wSize = c.window_size;
      unsigned int wOffset = c.window_offset;
      if (c.window_num>0) {
	wSize=(itRef->RefLength / c.window_num) + 1;
	wOffset=wSize;
      }
      while (pos < references[refIndex].RefLength) {
	int32_t window_len = pos+wSize;
	if (window_len > references[refIndex].RefLength) window_len = references[refIndex].RefLength;
	CovRecord sv;
	sv.chr = refIndex;
	sv.svStart = pos;
	sv.svEnd = window_len;
	std::stringstream s; 
	s << references[sv.chr].RefName << ":" << sv.svStart << "-" << sv.svEnd;
	idToName.insert(std::make_pair(intervalCount, s.str()));
	sv.id = intervalCount++;
	svs.push_back(sv);
	pos += wOffset;
      }
    }
  }

  // Output data types
  typedef std::pair<std::string, int> TSampleSVPair;
  typedef std::pair<int, int> TBpRead;
  typedef std::map<TSampleSVPair, TBpRead> TCountMap;
  TCountMap countMap;

  // Annotate coverage
  if (c.inclCigar) annotateCoverage(c.files, c.minGenoQual, sampleLib, svs, countMap, BpLevelType<BpLevelCount>(), covType);
  else annotateCoverage(c.files, c.minGenoQual, sampleLib, svs, countMap, BpLevelType<NoBpLevelCount>(), covType);

  // Output library statistics
  std::cout << "Library statistics" << std::endl;
  TSampleLibrary::const_iterator sampleIt=sampleLib.begin();
  for(;sampleIt!=sampleLib.end();++sampleIt) {
    std::cout << "Sample: " << sampleIt->first << std::endl;
    TLibraryMap::const_iterator libIt=sampleIt->second.begin();
    for(;libIt!=sampleIt->second.end();++libIt) {
      std::cout << "RG: ID=" << libIt->first << ",Median=" << libIt->second.median << ",MAD=" << libIt->second.mad << ",Orientation=" << (int) libIt->second.defaultOrient << std::endl;
    }
  }

  // Output file
  boost::iostreams::filtering_ostream dataOut;
  dataOut.push(boost::iostreams::gzip_compressor());
  dataOut.push(boost::iostreams::file_sink(c.outfile.string().c_str(), std::ios_base::out | std::ios_base::binary));

  // Print header
  dataOut << "#chr\tstart\tend\tid";
  for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
    std::string sampleName(c.files[file_c].stem().string());
    dataOut << "\t";
    if (c.avg_flag) dataOut << sampleName << "_avgcov" << "\t";
    if (c.bp_flag) dataOut << sampleName << "_bpcount" << "\t";
    if ((c.bp_flag) || (c.avg_flag)) dataOut << sampleName << "_readcount";
    else dataOut << sampleName;
  }
  dataOut << std::endl;

  // Iterate all SVs
  typename TSVs::const_iterator itSV = svs.begin();
  typename TSVs::const_iterator itSVEnd = svs.end();
  for(;itSV!=itSVEnd;++itSV) {
    dataOut << references[itSV->chr].RefName << "\t" << itSV->svStart << "\t" << itSV->svEnd << "\t" << idToName.find(itSV->id)->second;
    // Iterate all samples
    for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
      // Get the sample name
      std::string sampleName(c.files[file_c].stem().string());
      TSampleSVPair sampleSVPair = std::make_pair(sampleName, itSV->id);
      typename TCountMap::iterator countMapIt=countMap.find(sampleSVPair);
      dataOut << "\t";
      if (c.avg_flag) dataOut << ( (countMapIt->second.first) / (double) (itSV->svEnd - itSV->svStart)) << "\t";
      if (c.bp_flag) dataOut << countMapIt->second.first << "\t";
      dataOut << countMapIt->second.second;
    }
    dataOut << std::endl;
  }

  // End
  boost::posix_time::ptime now = boost::posix_time::second_clock::local_time();
  std::cout << '[' << boost::posix_time::to_simple_string(now) << "] Done." << std::endl;;
  return 0;
}