// makes a virtual, unified header for all the bam files in the multireader string BamMultiReaderPrivate::GetHeaderText(void) const { // N.B. - right now, simply copies all header data from first BAM, // and then appends RG's from other BAM files // TODO: make this more intelligent wrt other header lines/fields // if no readers open const size_t numReaders = m_readers.size(); if ( numReaders == 0 ) return string(); // retrieve first reader's header const MergeItem& firstItem = m_readers.front(); const BamReader* reader = firstItem.Reader; if ( reader == 0 ) return string(); SamHeader mergedHeader = reader->GetHeader(); // Add filename to read ID name so we know // which file it came from for (SamReadGroupIterator it = mergedHeader.ReadGroups.Begin(); it != mergedHeader.ReadGroups.End(); it++) { it->ID = it->ID + "-" + reader->GetFilename(); } // iterate over any remaining readers (skipping the first) for ( size_t i = 1; i < numReaders; ++i ) { const MergeItem& item = m_readers.at(i); const BamReader* reader = item.Reader; if ( reader == 0 ) continue; // retrieve current reader's header SamHeader currentHeader = reader->GetHeader(); // Add filename to read ID name so we know // which file it came from for (SamReadGroupIterator it = currentHeader.ReadGroups.Begin(); it != currentHeader.ReadGroups.End(); it++) { it->ID = it->ID + "-" + reader->GetFilename(); } // append current reader's RG entries to merged header // N.B. - SamReadGroupDictionary handles duplicate-checking mergedHeader.ReadGroups.Add(currentHeader.ReadGroups); // TODO: merge anything else?? } // return stringified header return mergedHeader.ToString(); }
/*! \fn bool BamWriter::Open(const std::string& filename, const SamHeader& samHeader, const RefVector& referenceSequences)
    \brief Opens a BAM file for writing, taking the header as a SamHeader object.

    This is an overloaded function.

    Passes m_numThreads to the private implementation through SetParallel(),
    then converts \a samHeader to text and forwards everything to the
    implementation's Open().

    Will overwrite the BAM file if it already exists.

    \param[in] filename           name of output BAM file
    \param[in] samHeader          header data, wrapped in SamHeader object
    \param[in] referenceSequences list of reference entries

    \return \c true if opened successfully
    \sa Close(), IsOpen(), BamReader::GetHeader(), BamReader::GetReferenceData()
*/
bool BamWriter::Open(const std::string& filename,
                     const SamHeader& samHeader,
                     const RefVector& referenceSequences)
{
    // apply the thread setting before the file is opened
    d->SetParallel(m_numThreads);

    // the private implementation takes the header as plain text
    const std::string headerText = samHeader.ToString();
    return d->Open(filename, headerText, referenceSequences);
}
void RecalibrationHandler::ReadRecalibrationFromComments(const SamHeader &samHeader, int max_flows_protect) { // Read comment lines from Sam header // this will grab json files if (samHeader.HasComments()) { // parse the comment lines for (unsigned int i_co=0; i_co<samHeader.Comments.size(); i_co++) { // try printing for now //cout << samHeader.Comments[i_co] << endl; // might be all sorts of comments in the file // therefore must find the unlikely magic code in the line before trying to parse string magic_code = "6d5b9d29ede5f176a4711d415d769108"; // md5hash "This uniquely identifies json comments for recalibration." bool valid_line = false; std::size_t found = samHeader.Comments[i_co].find(magic_code); if (found !=std::string::npos) valid_line = true; if (valid_line) { // very likely to be a properly formatted json object coming from basecaller Json::Value recal_params(Json::objectValue); Json::Reader recal_reader; bool parse_success = recal_reader.parse(samHeader.Comments[i_co], recal_params); if (!parse_success) { cout << "failed to parse comment line" << recal_reader.getFormattedErrorMessages() << endl; } else { // you are a recalibration object waiting to happen // let us parse you // basic ID //cout << my_members[0] << endl; string my_block_key = recal_params["MasterKey"].asCString(); //cout << my_block_key << "\t" << recal_params[my_block_key]["modelParameters"].size() << endl; recalModel.InitializeFromJSON(recal_params, my_block_key, false,max_flows_protect); // don't spam here // add a map to this entry bam_header_recalibration.insert(pair<string,RecalibrationModel>(my_block_key, recalModel)); // parse out important information from the block key // must look like <runid>.block_X%d_Y%d int end_runid = my_block_key.find("."); int bloc_loc = my_block_key.find("block_X")+7; int y_loc = my_block_key.find("_Y"); // glorified assembly language string runid = my_block_key.substr(0,end_runid); int x_coord = 
atoi(my_block_key.substr(bloc_loc,y_loc-bloc_loc).c_str()); int y_coord = atoi(my_block_key.substr(y_loc+2, my_block_key.size()-y_loc+2).c_str()); //cout << runid << "\t" << x_coord << "\t" << y_coord << endl; block_hash.insert(pair<string, pair<int,int > >(runid,pair<int,int>(x_coord,y_coord))); is_live = true; // found at least one recalibration entry } } } } // okay, now, avoid spamming with possibly large number of lines if (is_live){ cout << "Recalibration was detected from comment lines in bam file(s)" << endl; cout << bam_header_recalibration.size() << " unique blocks of recalibration info detected." << endl; } }
/*! \fn SamHeader::SamHeader(const SamHeader& other)
    \brief copy constructor

    Copies every header field of \a other member by member. The error string
    is taken from \a other through GetErrorString().

    \param[in] other header object to copy from
*/
SamHeader::SamHeader(const SamHeader& other)
    : Version(other.Version)
    , SortOrder(other.SortOrder)
    , GroupOrder(other.GroupOrder)
    , CustomTags(other.CustomTags)
    , Sequences(other.Sequences)
    , ReadGroups(other.ReadGroups)
    , Programs(other.Programs)
    , Comments(other.Comments)
    , m_errorString(other.GetErrorString())
{
    // nothing else to do - all state is copied in the initializer list
}
//{{{bool sort_inter_chrom_bam(string in_file_name, bool sort_inter_chrom_bam(string in_file_name, string out_file_name) { // open input BAM file BamReader reader; if ( !reader.Open(in_file_name) ) { cerr << "sort ERROR: could not open " << in_file_name << " for reading... Aborting." << endl; return false; } SamHeader header = reader.GetHeader(); if ( !header.HasVersion() ) header.Version = Constants::SAM_CURRENT_VERSION; string header_text = header.ToString(); RefVector ref = reader.GetReferenceData(); // set up alignments buffer BamAlignment al; vector<BamAlignment> buffer; buffer.reserve( (size_t)(SORT_DEFAULT_MAX_BUFFER_COUNT*1.1) ); bool bufferFull = false; int buff_count = 0; // iterate through file while ( reader.GetNextAlignment(al)) { // check buffer's usage bufferFull = ( buffer.size() >= SORT_DEFAULT_MAX_BUFFER_COUNT ); // store alignments until buffer is "full" if ( !bufferFull ) buffer.push_back(al); // if buffer is "full" else { // so create a sorted temp file with current buffer contents // then push "al" into fresh buffer create_sorted_temp_file(buffer, out_file_name, buff_count, header_text, ref); ++buff_count; buffer.push_back(al); } } // handle any leftover buffer contents if ( !buffer.empty() ) { create_sorted_temp_file(buffer, out_file_name, buff_count, header_text, ref); ++buff_count; } reader.Close(); return merge_sorted_files(out_file_name, buff_count, header_text, ref); /* for (int i = 0; i < buff_count; ++i) { stringstream temp_name; temp_name << out_file_name << i; } */ }
// generates mutiple sorted temp BAM files from single unsorted BAM file bool SortTool::SortToolPrivate::GenerateSortedRuns(void) { // open input BAM file BamReader reader; if ( !reader.Open(m_settings->InputBamFilename) ) { cerr << "bamtools sort ERROR: could not open " << m_settings->InputBamFilename << " for reading... Aborting." << endl; return false; } // get basic data that will be shared by all temp/output files SamHeader header = reader.GetHeader(); header.SortOrder = ( m_settings->IsSortingByName ? Constants::SAM_HD_SORTORDER_QUERYNAME : Constants::SAM_HD_SORTORDER_COORDINATE ); m_headerText = header.ToString(); m_references = reader.GetReferenceData(); // set up alignments buffer BamAlignment al; vector<BamAlignment> buffer; buffer.reserve( (size_t)(m_settings->MaxBufferCount*1.1) ); bool bufferFull = false; // if sorting by name, we need to generate full char data // so can't use GetNextAlignmentCore() if ( m_settings->IsSortingByName ) { // iterate through file while ( reader.GetNextAlignment(al)) { // check buffer's usage bufferFull = ( buffer.size() >= m_settings->MaxBufferCount ); // store alignments until buffer is "full" if ( !bufferFull ) buffer.push_back(al); // if buffer is "full" else { // push any unmapped reads into buffer, // don't want to split these into a separate temp file if ( !al.IsMapped() ) buffer.push_back(al); // "al" is mapped, so create a sorted temp file with current buffer contents // then push "al" into fresh buffer else { CreateSortedTempFile(buffer); buffer.push_back(al); } } } } // sorting by position, can take advantage of GNACore() speedup else { // iterate through file while ( reader.GetNextAlignmentCore(al) ) { // check buffer's usage bufferFull = ( buffer.size() >= m_settings->MaxBufferCount ); // store alignments until buffer is "full" if ( !bufferFull ) buffer.push_back(al); // if buffer is "full" else { // push any unmapped reads into buffer, // don't want to split these into a separate temp file if ( !al.IsMapped() 
) buffer.push_back(al); // "al" is mapped, so create a sorted temp file with current buffer contents // then push "al" into fresh buffer else { CreateSortedTempFile(buffer); buffer.push_back(al); } } } } // handle any leftover buffer contents if ( !buffer.empty() ) CreateSortedTempFile(buffer); // close reader & return success reader.Close(); return true; }
/*! \fn bool BamWriter::Open(const std::string& filename, const SamHeader& samHeader, const RefVector& referenceSequences)
    \brief Opens a BAM file for writing, taking the header as a SamHeader object.

    This is an overloaded function.

    Converts \a samHeader to text and forwards all arguments to the private
    implementation's Open().

    Will overwrite the BAM file if it already exists.

    \param filename           name of output BAM file
    \param samHeader          header data, wrapped in SamHeader object
    \param referenceSequences list of reference entries

    \return \c true if opened successfully
    \sa Close(), IsOpen(), BamReader::GetHeader(), BamReader::GetReferenceData()
*/
bool BamWriter::Open(const std::string& filename,
                     const SamHeader& samHeader,
                     const RefVector& referenceSequences)
{
    // the private implementation takes the header as plain text
    const std::string headerText = samHeader.ToString();
    return d->Open(filename, headerText, referenceSequences);
}
void RecalibrationHandler::ReadRecalibrationFromComments(const SamHeader &samHeader, const map<string, int> &max_flows_by_run_id) { if (not samHeader.HasComments()) return; unsigned int num_parsing_errors = 0; // Read comment lines from Sam header for (unsigned int i_co=0; i_co<samHeader.Comments.size(); i_co++) { // There might be all sorts of comments in the file // therefore must find the unlikely magic code in the line before trying to parse string magic_code = "6d5b9d29ede5f176a4711d415d769108"; // md5hash "This uniquely identifies json comments for recalibration." if (samHeader.Comments[i_co].find(magic_code) == std::string::npos) { //cout << endl << "No magic code found in comment line "<< i_co <<endl; //cout << samHeader.Comments.at(i_co) << endl; continue; } // Parse recalibration Json object Json::Value recal_params(Json::objectValue); Json::Reader recal_reader; if (not recal_reader.parse(samHeader.Comments[i_co], recal_params)) { cerr << "Failed to parse recalibration comment line " << recal_reader.getFormattedErrorMessages() << endl; num_parsing_errors++; continue; } string my_block_key = recal_params["MasterKey"].asString(); // Assumes that the MasterKey is written in the format <run_id>.block_X<x_offset>_Y<y_offset> int end_runid = my_block_key.find("."); int x_loc = my_block_key.find("block_X")+7; int y_loc = my_block_key.find("_Y"); // glorified assembly language string runid = my_block_key.substr(0,end_runid); int x_coord = atoi(my_block_key.substr(x_loc,y_loc-x_loc).c_str()); int y_coord = atoi(my_block_key.substr(y_loc+2, my_block_key.size()-y_loc+2).c_str()); // Protection against not having a flow order for a specified recalibration run id std::map<string, int>::const_iterator n_flows = max_flows_by_run_id.find(runid); if (n_flows == max_flows_by_run_id.end()) { cerr << "TVC ERROR: Recalibration information found for run id " << runid << " but there is no matching read group with this run id in the bam header." 
<< endl; exit(EXIT_FAILURE); } //recalModel.InitializeFromJSON(recal_params, my_block_key, false, max_flows_by_run_id.at(runid)); // void RecalibrationModel::InitializeFromJSON(Json::Value &recal_params, string &my_block_key, bool spam_enabled, int over_flow_protect) { // The calibration comment line contains info about the hp threshold used during base calling, so set to zero here // XXX FIXME: The number of flows in the TVC group can be larger than the one specified in the calibration block. recalModel.InitializeModelFromJson(recal_params, n_flows->second); bam_header_recalibration.insert(pair<string,LinearCalibrationModel>(my_block_key, recalModel)); block_hash.insert(pair<string, pair<int,int > >(runid,pair<int,int>(x_coord,y_coord))); is_live = true; // found at least one recalibration entry } // Verbose output if (is_live){ cout << "Recalibration was detected from comment lines in bam file(s):" << endl; cout << bam_header_recalibration.size() << " unique blocks of recalibration info detected." << endl; } if (num_parsing_errors > 0) { cout << "Failed to parse " << num_parsing_errors << " recalibration comment lines." << endl; } }