// builds one virtual, unified header covering every BAM file in the multireader
//
// returns the merged header as text, or an empty string when no readers are
// open or the first reader is missing
string BamMultiReaderPrivate::GetHeaderText(void) const {

    // N.B. - current strategy: take the first BAM's header wholesale,
    //        then append only the read groups (RG) of the other BAM files
    // TODO: make this more intelligent wrt other header lines/fields

    // no readers open -> no header text
    const size_t readerCount = m_readers.size();
    if ( readerCount == 0 ) return string();

    // the first reader provides the base header; give up if it is missing
    const BamReader* firstReader = m_readers.front().Reader;
    if ( firstReader == 0 ) return string();
    SamHeader mergedHeader = firstReader->GetHeader();

    // suffix every read group ID with the source filename so we know
    // which file it came from
    SamReadGroupIterator baseGroup = mergedHeader.ReadGroups.Begin();
    while ( baseGroup != mergedHeader.ReadGroups.End() ) {
        baseGroup->ID = baseGroup->ID + "-" + firstReader->GetFilename();
        ++baseGroup;
    }

    // visit the remaining readers (index 0 was handled above)
    for ( size_t readerIndex = 1; readerIndex < readerCount; ++readerIndex ) {

        // readers that are missing are silently skipped
        const BamReader* currentReader = m_readers.at(readerIndex).Reader;
        if ( currentReader == 0 ) continue;

        // work on a private copy of this reader's header
        SamHeader currentHeader = currentReader->GetHeader();

        // same filename suffix as above, applied to this file's read groups
        SamReadGroupIterator group = currentHeader.ReadGroups.Begin();
        while ( group != currentHeader.ReadGroups.End() ) {
            group->ID = group->ID + "-" + currentReader->GetFilename();
            ++group;
        }

        // append current reader's RG entries to merged header
        // N.B. - SamReadGroupDictionary handles duplicate-checking
        mergedHeader.ReadGroups.Add(currentHeader.ReadGroups);

        // TODO: merge anything else??
    }

    // hand back the merged header in its text form
    return mergedHeader.ToString();
}
/*! \fn bool BamWriter::Open(const std::string& filename,
                             const SamHeader& samHeader,
                             const RefVector& referenceSequences)
    \brief Opens a BAM file for writing, taking the header as a SamHeader object.

    This is an overloaded function. The header is converted to text with
    SamHeader::ToString() and the call is forwarded to the private implementation.

    Will overwrite the BAM file if it already exists.

    \param[in] filename           name of output BAM file
    \param[in] samHeader          header data, wrapped in SamHeader object
    \param[in] referenceSequences list of reference entries

    \return \c true if opened successfully
    \sa Close(), IsOpen(), BamReader::GetHeader(), BamReader::GetReferenceData()
*/
bool BamWriter::Open(const std::string& filename,
                     const SamHeader& samHeader,
                     const RefVector& referenceSequences)
{
    // pass m_numThreads to the private implementation before opening
    d->SetParallel(m_numThreads);

    // the private implementation takes the header in its text form
    const std::string headerText = samHeader.ToString();
    return d->Open(filename, headerText, referenceSequences);
}
void RecalibrationHandler::ReadRecalibrationFromComments(const SamHeader &samHeader, int max_flows_protect) {
    // Read comment lines from Sam header
    // this will grab json files
    if (samHeader.HasComments()) {
        // parse the comment lines
        for (unsigned int i_co=0; i_co<samHeader.Comments.size(); i_co++) {
            // try printing for now
            //cout << samHeader.Comments[i_co] << endl;
            // might be all sorts of comments in the file
            // therefore must find the unlikely magic code in the line before trying to parse
            string magic_code = "6d5b9d29ede5f176a4711d415d769108"; // md5hash "This uniquely identifies json comments for recalibration."
            bool valid_line = false;
            std::size_t found = samHeader.Comments[i_co].find(magic_code);
            if (found !=std::string::npos)
                valid_line = true;

            if (valid_line) {
                // very likely to be a properly formatted json object coming from basecaller
                Json::Value recal_params(Json::objectValue);
                Json::Reader recal_reader;
                bool parse_success = recal_reader.parse(samHeader.Comments[i_co], recal_params);
                if (!parse_success) {
                    cout << "failed to parse comment line" << recal_reader.getFormattedErrorMessages() << endl;
                } else {
                    // you are a recalibration object waiting to happen
                    // let us parse you
                    // basic ID

                    //cout << my_members[0] << endl;
                    string my_block_key = recal_params["MasterKey"].asCString();
                    //cout << my_block_key << "\t" << recal_params[my_block_key]["modelParameters"].size() << endl;
                    recalModel.InitializeFromJSON(recal_params, my_block_key, false,max_flows_protect);  // don't spam here
                    // add a map to this entry
                    bam_header_recalibration.insert(pair<string,RecalibrationModel>(my_block_key, recalModel));
                    // parse out important information from the block key
                    // must look like <runid>.block_X%d_Y%d
                    int end_runid = my_block_key.find(".");
                    int bloc_loc = my_block_key.find("block_X")+7;
                    int y_loc = my_block_key.find("_Y");
                    // glorified assembly language
                    string runid = my_block_key.substr(0,end_runid);
                    int x_coord = atoi(my_block_key.substr(bloc_loc,y_loc-bloc_loc).c_str());
                    int y_coord = atoi(my_block_key.substr(y_loc+2, my_block_key.size()-y_loc+2).c_str());
                    //cout << runid << "\t" << x_coord << "\t" << y_coord << endl;
                    block_hash.insert(pair<string, pair<int,int > >(runid,pair<int,int>(x_coord,y_coord)));
                    is_live = true; // found at least one recalibration entry
                }
            }
        }
    }

    // okay, now, avoid spamming with possibly large number of lines
    if (is_live){
      cout << "Recalibration was detected from comment lines in bam file(s)" << endl;
      cout << bam_header_recalibration.size() << " unique blocks of recalibration info detected." << endl;
    }
}
Esempio n. 4
0
/*! \fn SamHeader::SamHeader(const SamHeader& other)
    \brief Copy constructor.

    Copy-initializes each header field from \a other; the error string is
    taken from other.GetErrorString().
*/
SamHeader::SamHeader(const SamHeader& other) :
    Version(other.Version),
    SortOrder(other.SortOrder),
    GroupOrder(other.GroupOrder),
    CustomTags(other.CustomTags),
    Sequences(other.Sequences),
    ReadGroups(other.ReadGroups),
    Programs(other.Programs),
    Comments(other.Comments),
    m_errorString(other.GetErrorString())
{
}
Esempio n. 5
0
//{{{bool sort_inter_chrom_bam(string in_file_name,
bool sort_inter_chrom_bam(string in_file_name,
						  string out_file_name)
{
    // Reads in_file_name in chunks of at most SORT_DEFAULT_MAX_BUFFER_COUNT
    // alignments, hands each chunk to create_sorted_temp_file(), and finally
    // returns the result of merge_sorted_files() over all chunks.
    // Returns false right away if the input cannot be opened.

    // the input BAM must be readable, otherwise there is nothing to sort
    BamReader reader;
    if ( !reader.Open(in_file_name) ) {
        cerr << "sort ERROR: could not open " << 
			in_file_name << " for reading... Aborting." << endl;
        return false;
    }

    // header text and references are shared by every temp file and the merge;
    // a header without a version gets the current SAM version
    SamHeader header = reader.GetHeader();
    if ( !header.HasVersion() )
        header.Version = Constants::SAM_CURRENT_VERSION;

    string header_text = header.ToString();
    RefVector ref = reader.GetReferenceData();

    // alignment buffer, reserved slightly above the flush threshold
    BamAlignment al;
    vector<BamAlignment> buffer;
    buffer.reserve( (size_t)(SORT_DEFAULT_MAX_BUFFER_COUNT*1.1) );

    // number of temp files requested so far
    int run_count = 0;

    // iterate through file
    while ( reader.GetNextAlignment(al) ) {

        // buffer is "full": create a sorted temp file with the current buffer
        // contents before "al" is stored
        if ( buffer.size() >= SORT_DEFAULT_MAX_BUFFER_COUNT ) {
            create_sorted_temp_file(buffer, out_file_name, run_count, header_text, ref);
            ++run_count;
        }

        buffer.push_back(al);
    }

    // whatever is still buffered becomes the last temp file
    if ( !buffer.empty() ) {
        create_sorted_temp_file(buffer, out_file_name, run_count, header_text, ref);
        ++run_count;
    }

    reader.Close();

    return merge_sorted_files(out_file_name, run_count, header_text, ref);
}
Esempio n. 6
0
// generates multiple sorted temp BAM files from a single unsorted BAM file
//
// returns false if the input BAM cannot be opened, true otherwise
bool SortTool::SortToolPrivate::GenerateSortedRuns(void) {

    // the input BAM must be readable
    BamReader reader;
    if ( !reader.Open(m_settings->InputBamFilename) ) {
        cerr << "bamtools sort ERROR: could not open " << m_settings->InputBamFilename
             << " for reading... Aborting." << endl;
        return false;
    }

    const bool sortingByName = m_settings->IsSortingByName;

    // header text & references are shared by all temp/output files;
    // the header's sort order reflects the requested sort mode
    SamHeader header = reader.GetHeader();
    if ( sortingByName )
        header.SortOrder = Constants::SAM_HD_SORTORDER_QUERYNAME;
    else
        header.SortOrder = Constants::SAM_HD_SORTORDER_COORDINATE;
    m_headerText = header.ToString();
    m_references = reader.GetReferenceData();

    // alignment buffer, reserved slightly above the configured maximum
    BamAlignment al;
    vector<BamAlignment> buffer;
    buffer.reserve( static_cast<size_t>(m_settings->MaxBufferCount*1.1) );

    // sorting by name needs the full char data, so GetNextAlignment() is used;
    // sorting by position can take advantage of the GetNextAlignmentCore() speedup
    while ( sortingByName ? reader.GetNextAlignment(al)
                          : reader.GetNextAlignmentCore(al) ) {

        // a full buffer is written out as a sorted temp file only when "al" is
        // mapped: unmapped reads are kept in the current buffer so they are not
        // split into a separate temp file
        if ( (buffer.size() >= m_settings->MaxBufferCount) && al.IsMapped() )
            CreateSortedTempFile(buffer);

        buffer.push_back(al);
    }

    // whatever is still buffered becomes the final temp file
    if ( !buffer.empty() )
        CreateSortedTempFile(buffer);

    // close reader & return success
    reader.Close();
    return true;
}
Esempio n. 7
0
/*! \fn bool BamWriter::Open(const std::string& filename,
                             const SamHeader& samHeader,
                             const RefVector& referenceSequences)
    \brief Opens a BAM file for writing, taking the header as a SamHeader object.

    This is an overloaded function. The header is converted to text with
    SamHeader::ToString() and the call is forwarded to the private implementation.

    Will overwrite the BAM file if it already exists.

    \param filename           name of output BAM file
    \param samHeader          header data, wrapped in SamHeader object
    \param referenceSequences list of reference entries

    \return \c true if opened successfully
    \sa Close(), IsOpen(), BamReader::GetHeader(), BamReader::GetReferenceData()
*/
bool BamWriter::Open(const std::string& filename,
                     const SamHeader& samHeader,
                     const RefVector& referenceSequences)
{
    // the private implementation takes the header in its text form
    const std::string headerText = samHeader.ToString();
    return d->Open(filename, headerText, referenceSequences);
}
Esempio n. 8
0
void RecalibrationHandler::ReadRecalibrationFromComments(const SamHeader &samHeader, const map<string, int> &max_flows_by_run_id) {


  if (not samHeader.HasComments())
    return;

  unsigned int num_parsing_errors = 0;
  // Read comment lines from Sam header
  for (unsigned int i_co=0; i_co<samHeader.Comments.size(); i_co++) {

    // There might be all sorts of comments in the file
    // therefore must find the unlikely magic code in the line before trying to parse
    string magic_code = "6d5b9d29ede5f176a4711d415d769108"; // md5hash "This uniquely identifies json comments for recalibration."

    if (samHeader.Comments[i_co].find(magic_code) == std::string::npos) {
      //cout << endl << "No magic code found in comment line "<< i_co <<endl;
      //cout << samHeader.Comments.at(i_co) << endl;
      continue;
    }

    // Parse recalibration Json object
    Json::Value recal_params(Json::objectValue);
    Json::Reader recal_reader;
    if (not recal_reader.parse(samHeader.Comments[i_co], recal_params)) {
      cerr << "Failed to parse recalibration comment line " << recal_reader.getFormattedErrorMessages() << endl;
      num_parsing_errors++;
      continue;
    }

    string my_block_key = recal_params["MasterKey"].asString();

    // Assumes that the MasterKey is written in the format <run_id>.block_X<x_offset>_Y<y_offset>
    int end_runid = my_block_key.find(".");
    int x_loc     = my_block_key.find("block_X")+7;
    int y_loc     = my_block_key.find("_Y");

    // glorified assembly language
    string runid = my_block_key.substr(0,end_runid);
    int x_coord = atoi(my_block_key.substr(x_loc,y_loc-x_loc).c_str());
    int y_coord = atoi(my_block_key.substr(y_loc+2, my_block_key.size()-y_loc+2).c_str());

    // Protection against not having a flow order for a specified recalibration run id
    std::map<string, int>::const_iterator n_flows = max_flows_by_run_id.find(runid);
    if (n_flows == max_flows_by_run_id.end()) {
      cerr << "TVC ERROR: Recalibration information found for run id " << runid
    	   << " but there is no matching read group with this run id in the bam header." << endl;
      exit(EXIT_FAILURE);
    }

    //recalModel.InitializeFromJSON(recal_params, my_block_key, false, max_flows_by_run_id.at(runid));
    // void RecalibrationModel::InitializeFromJSON(Json::Value &recal_params, string &my_block_key, bool spam_enabled, int over_flow_protect) {
    // The calibration comment line contains  info about the hp threshold used during base calling, so set to zero here
    // XXX FIXME: The number of flows in the TVC group can be larger than the one specified in the calibration block.
    recalModel.InitializeModelFromJson(recal_params, n_flows->second);
    bam_header_recalibration.insert(pair<string,LinearCalibrationModel>(my_block_key, recalModel));
    block_hash.insert(pair<string, pair<int,int > >(runid,pair<int,int>(x_coord,y_coord)));
    is_live = true; // found at least one recalibration entry
  }

  // Verbose output
  if (is_live){
    cout << "Recalibration was detected from comment lines in bam file(s):" << endl;
    cout << bam_header_recalibration.size() << " unique blocks of recalibration info detected." << endl;
  }
  if (num_parsing_errors > 0) {
    cout << "Failed to parse " << num_parsing_errors << " recalibration comment lines." << endl;
  }
}