Exemple #1
0
// build the read group library map
void Dedup_LowMem::buildReadGroupLibraryMap(SamFileHeader& header) {
    rgidLibMap.clear();
    numLibraries = 0;
    std::map<std::string,uint32_t> libNameMap;

    SamHeaderRecord * headerRecord = header.getNextRGRecord();
    while(headerRecord != NULL) {
        std::string ID = headerRecord->getTagValue("ID");
        std::string LB = headerRecord->getTagValue("LB");

        if ( ID.empty() ) {
            std::string headerRecordString;
            headerRecord->appendString(headerRecordString);
            Logger::gLogger->error("Cannot find readGroup ID information in the header line %s",
                                   headerRecordString.c_str());
        }
        if ( rgidLibMap.find(ID) != rgidLibMap.end() ) {
            Logger::gLogger->error("The readGroup ID %s is not a unique identifier",ID.c_str());
        }

        if ( LB.empty() ) {
            std::string headerRecordString;
            headerRecord->appendString(headerRecordString);
            Logger::gLogger->warning("Cannot find library information in the header line %s. Using empty string for library name",
                                     headerRecordString.c_str());
        }

        if ( libNameMap.find( LB ) != libNameMap.end() ) {
            rgidLibMap[ID] = libNameMap[LB];
        }
        else {
            numLibraries = libNameMap.size()+1;
            libNameMap[LB] = numLibraries;
            rgidLibMap[ID] = numLibraries;
        }
        headerRecord = header.getNextRGRecord();
    }

    if (numLibraries > 0xff) {
        Logger::gLogger->error("More than 255 library names are identified. Dedup_LowMem currently only allows up to 255 library names");
    }
}
Exemple #2
0
void parseOutRG(SamFileHeader& header, std::string& noRgPgString, SamFileHeader* newHeader)
{
    noRgPgString.clear();
    // strings for comparing if two RGs with same ID are the same.
    static std::string prevString = "";
    static std::string newString = "";

    SamHeaderRecord* rec = header.getNextHeaderRecord();
    while(rec != NULL)
    {
        if(rec->getType() == SamHeaderRecord::RG)
        {
            if(newHeader != NULL)
            {
                // This is an RG line.
                // First check if this RG is already included in the new header.
                SamHeaderRG* prevRG = newHeader->getRG(rec->getTagValue("ID"));
                
                if(prevRG != NULL)
                {
                    // This RG already exists, check that they are the same.
                    // If they are the same, there is nothing to do.
                    bool status = true;
                    prevString.clear();
                    newString.clear();
                    status &= prevRG->appendString(prevString);
                    status &= rec->appendString(newString);
                    if(prevString != newString)
                    {
                        // They are not identical, so report an error.
                        Logger::gLogger->error("Failed to add readgroup to header, "
                                               "duplicate, but non-identical RG ID, %s",
                                               rec->getTagValue("ID"));
                    }
                }
                else
                {
                    // This RG does not exist yet, so add it to the new header.
                    if(!newHeader->addRecordCopy((SamHeaderRG&)(*rec)))
                    {
                        // Failed to add the RG, exit.
                        Logger::gLogger->error("Failed to add readgroup to header, %s",
                                               newHeader->getErrorMessage());
                    }
                }
            }
        }
        else if(rec->getType() == SamHeaderRecord::PG)
        {
            if(newHeader != NULL)
            {
                // This is a PG line.
                // First check if this PG is already included in the new header.
                SamHeaderPG* prevPG = newHeader->getPG(rec->getTagValue("ID"));
                
                if(prevPG != NULL)
                {
                    // This PG already exists, check if they are the same.
                    // If they are the same, there is nothing to do.
                    bool status = true;
                    prevString.clear();
                    newString.clear();
                    status &= prevPG->appendString(prevString);
                    status &= rec->appendString(newString);
                    if(prevString != newString)
                    {
                        // They are not identical, ignore for now.
                        // TODO: change the ID, and add it.
                        Logger::gLogger->warning("Warning: dropping duplicate, "
                                                 "but non-identical PG ID, %s",
                                                 rec->getTagValue("ID"));
                    }
                }
                else
                {
                    // This PG does not exist yet, so add it to the new header.
                    if(!newHeader->addRecordCopy((SamHeaderPG&)(*rec)))
                    {
                        // Failed to add the PG, exit.
                        Logger::gLogger->error("Failed to add PG to header, %s",
                                               newHeader->getErrorMessage());
                    }
                }
            }
        }
        else
        {
            rec->appendString(noRgPgString);
        }
        rec = header.getNextHeaderRecord();
    }

    // Append the comments.
    header.appendCommentLines(noRgPgString);
}
Exemple #3
0
int main(int argc, char ** argv)
{
  gpLogger = new Logger;

  static struct option getopt_long_options[] = 
    {
      // Input options
      { "fasta", required_argument, NULL, 'f'},
      { "in", required_argument, NULL, 'i'},
      { "out", required_argument, NULL, 'o'},
      { "verbose", no_argument, NULL, 'v'},
      { "log", required_argument, NULL, 'l'},
      { "clear", no_argument, NULL, 0},
      { "AS", required_argument, NULL, 0},
      { "UR", required_argument, NULL, 0},
      { "SP", required_argument, NULL, 0},
      { "HD", required_argument, NULL, 0},
      { "RG", required_argument, NULL, 0},
      { "PG", required_argument, NULL, 0},
      { "checkSQ", no_argument, NULL, 0},
      { NULL, 0, NULL, 0 },
    };

  int n_option_index = 0, c;
  
  std::string sAS, sUR, sSP, sFasta, sInFile, sOutFile, sLogFile;
  bool bClear, bCheckSQ, bVerbose;
  std::vector<std::string> vsHDHeaders, vsRGHeaders, vsPGHeaders;

  bCheckSQ = bVerbose = false;
  bClear = true;

  while ( (c = getopt_long(argc, argv, "vf:i:o:l:", getopt_long_options, &n_option_index)) != -1 ) {
      //    std::cout << getopt_long_options[n_option_index].name << "\t" << optarg << std::endl;
    if ( c == 'f' ) {
      sFasta = optarg;
    }
    else if ( c == 'i' ) {
      sInFile = optarg;
    }
    else if ( c == 'o' ) {
      sOutFile = optarg;
    }
    else if ( c == 'v' ) {
      bVerbose = true;
    }
    else if ( c == 'l' ) {
	sLogFile = optarg;
    }
    else if ( strcmp(getopt_long_options[n_option_index].name,"AS") == 0 ) {
      sAS = optarg;
    }
    else if ( strcmp(getopt_long_options[n_option_index].name,"UR") == 0 ) {
      sUR = optarg;
    }
    else if ( strcmp(getopt_long_options[n_option_index].name,"SP") == 0 ) {
      sSP = optarg;
    }
    else if ( strcmp(getopt_long_options[n_option_index].name,"HD") == 0 ) {
      vsHDHeaders.push_back(optarg);
    }
    else if ( strcmp(getopt_long_options[n_option_index].name,"RG") == 0 ) {
      vsRGHeaders.push_back(optarg);
    }
    else if ( strcmp(getopt_long_options[n_option_index].name,"PG") == 0 ) {
      vsPGHeaders.push_back(optarg);
    }
    else if ( strcmp(getopt_long_options[n_option_index].name,"checkSQ") == 0 ) {
      bCheckSQ = true;
    }
    else {
      std::cerr << "Error: Unrecognized option " << getopt_long_options[n_option_index].name << std::endl;
      abort();
    }
  }

  if ( optind < argc ) {
    printUsage(std::cerr);
    gpLogger->error("non-option argument %s exist ",argv[optind]);
  }

  if ( sInFile.empty() || sOutFile.empty() ) {
    printUsage(std::cerr);
    gpLogger->error("Input and output files are required");
  }

  if ( sLogFile.compare("__NONE__") == 0 ) {
    sLogFile = (sOutFile + ".log");
  }

  gpLogger->open(sLogFile.c_str(), bVerbose);

  if ( ( bCheckSQ ) && ( sFasta.empty() ) ) {
    printUsage(std::cerr);
    gpLogger->error("--checkSQ option must be used with --fasta option");
  }

  // check whether each header line starts with a correct tag
  checkHeaderStarts(vsHDHeaders, "@HD\t");
  checkHeaderStarts(vsRGHeaders, "@RG\t");
  checkHeaderStarts(vsPGHeaders, "@PG\t");

  gpLogger->write_log("Arguments in effect:");
  gpLogger->write_log("\t--in [%s]",sInFile.c_str());
  gpLogger->write_log("\t--out [%s]",sOutFile.c_str());
  gpLogger->write_log("\t--log [%s]",sLogFile.c_str());
  gpLogger->write_log("\t--fasta [%s]",sFasta.c_str());
  gpLogger->write_log("\t--AS [%s]",sAS.c_str());
  gpLogger->write_log("\t--UR [%s]",sUR.c_str());
  gpLogger->write_log("\t--SP [%s]",sSP.c_str());
  gpLogger->write_log("\t--checkSQ [%s]",bClear ? "ON" : "OFF" );
  if ( vsHDHeaders.empty() ) {
    gpLogger->write_log("\t--HD []");
  }
  else {
    gpLogger->write_log("\t--HD [%s]",vsHDHeaders[0].c_str());
  }
  if ( vsRGHeaders.empty() ) {
    gpLogger->write_log("\t--RG []");
  }
  else {
    gpLogger->write_log("\t--RG [%s]",vsRGHeaders[0].c_str());
  }
  if ( vsPGHeaders.empty() ) {
    gpLogger->write_log("\t--PG []");
  }
  else {
    for(uint32_t i=0; i < vsPGHeaders.size(); ++i) {
      gpLogger->write_log("\t--PG [%s]",vsPGHeaders[i].c_str());
    }
  }

  if ( (vsHDHeaders.empty() ) && ( vsRGHeaders.empty() ) && ( vsPGHeaders.empty() ) && ( !bClear ) && ( sFasta.empty() ) ) {
    gpLogger->warning("No option is in effect for modifying BAM files. The input and output files will be identical");
  }

  if ( ( vsHDHeaders.size() > 1 ) || ( vsRGHeaders.size() > 1 ) ) {
    gpLogger->error("HD and RG headers cannot be multiple");
  }

  FastaFile fastaFile;
  if ( ! sFasta.empty() ) {
    if ( fastaFile.open(sFasta.c_str()) ) {
      gpLogger->write_log("Reading the reference file %s",sFasta.c_str());
      fastaFile.readThru();
      fastaFile.close();
      gpLogger->write_log("Finished reading the reference file %s",sFasta.c_str());      
    }
    else {
      gpLogger->error("Failed to open reference file %s",sFasta.c_str());
    }
  }

  SamFile samIn;
  SamFile samOut;

  if ( ! samIn.OpenForRead(sInFile.c_str()) ) {
    gpLogger->error("Cannot open BAM file %s for reading - %s",sInFile.c_str(), SamStatus::getStatusString(samIn.GetStatus()) );
  }
  if ( ! samOut.OpenForWrite(sOutFile.c_str()) ) {
    gpLogger->error("Cannot open BAM file %s for writing - %s",sOutFile.c_str(), SamStatus::getStatusString(samOut.GetStatus()) );
  }

  SamFileHeader samHeader;
  SamHeaderRecord* pSamHeaderRecord;
  samIn.ReadHeader(samHeader);

  // check the sanity of SQ file
  // make sure the SN and LN matches, with the same order
  if ( bCheckSQ ) {
    unsigned int numSQ = 0;
    while( (pSamHeaderRecord = samHeader.getNextHeaderRecord()) != NULL ) {
      if ( pSamHeaderRecord->getType() == SamHeaderRecord::SQ ) {
	++numSQ;
      }
    }

    if ( numSQ != fastaFile.vsSequenceNames.size() ) {
      gpLogger->error("# of @SQ tags are different from the original BAM and the reference file");
    }

    // iterator over all @SQ objects
    for(unsigned int i=0; i < numSQ; ++i) {
      pSamHeaderRecord = samHeader.getSQ(fastaFile.vsSequenceNames[i].c_str());
      if ( fastaFile.vsSequenceNames[i].compare(pSamHeaderRecord->getTagValue("SN")) != 0 ) {
	gpLogger->error("SequenceName is not identical between fasta and input BAM file");
      }
      else if ( static_cast<int>(fastaFile.vnSequenceLengths[i]) != atoi(pSamHeaderRecord->getTagValue("LN")) ) {
	gpLogger->error("SequenceLength is not identical between fasta and input BAM file");
      }
      else {
	if ( !sAS.empty() ) 
	  samHeader.setSQTag("AS",sAS.c_str(),fastaFile.vsSequenceNames[i].c_str());
	samHeader.setSQTag("M5",fastaFile.vsMD5sums[i].c_str(),fastaFile.vsSequenceNames[i].c_str());
	if ( !sUR.empty() ) 
	  samHeader.setSQTag("UR",sUR.c_str(),fastaFile.vsSequenceNames[i].c_str());
	if ( !sSP.empty() ) 
	  samHeader.setSQTag("SP",sSP.c_str(),fastaFile.vsSequenceNames[i].c_str());
      }
    }
    gpLogger->write_log("Finished checking the consistency of SQ tags");
  }
  else {
    gpLogger->write_log("Skipped checking the consistency of SQ tags");
  }

  // go over the headers again, 
  // assuming order of HD, SQ, RG, PG, and put proper tags at the end of the original tags

  gpLogger->write_log("Creating the header of new output file");
  //SamFileHeader outHeader;
  samHeader.resetHeaderRecordIter();

  for(unsigned int i=0; i < vsHDHeaders.size(); ++i) {
    samHeader.addHeaderLine(vsHDHeaders[i].c_str());
  }

  /*
  for(int i=0; i < fastaFile.vsSequenceNames.size(); ++i) {
    std::string s("@SQ\tSN:");
    char buf[1024];
    s += fastaFile.vsSequenceNames[i];
    sprintf(buf,"\tLN:%d",fastaFile.vnSequenceLengths[i]);
    s += buf;
    if ( !sAS.empty() ) {
      sprintf(buf,"\tAS:%s",sAS.c_str());
      s += buf;
    }
    if ( !sUR.empty() ) {
      sprintf(buf,"\tUR:%s",sUR.c_str());
      s += buf;
    }
    sprintf(buf,"\tM5:%s",fastaFile.vsMD5sums[i].c_str());
    s += buf;
    if ( !sSP.empty() ) {
      sprintf(buf,"\tSP:%s",sSP.c_str());
      s += buf;
    }
    outHeader.addHeaderLine(s.c_str());
    }*/

  for(unsigned int i=0; i < vsRGHeaders.size(); ++i) {
    samHeader.addHeaderLine(vsRGHeaders[i].c_str());
  }

  for(unsigned int i=0; i < vsPGHeaders.size(); ++i) {
    samHeader.addHeaderLine(vsPGHeaders[i].c_str());
  }

  samOut.WriteHeader(samHeader);
  gpLogger->write_log("Adding %d HD, %d RG, and %d PG headers",vsHDHeaders.size(), vsRGHeaders.size(), vsPGHeaders.size());
  gpLogger->write_log("Finished writing output headers");

  // parse RG tag and get RG ID to append
  std::string sRGID;
  if ( ! vsRGHeaders.empty() ) {
    std::vector<std::string> tokens;
    FastaFile::tokenizeString( vsRGHeaders[0].c_str(), tokens );
    for(unsigned int i=0; i < tokens.size(); ++i) {
      if ( tokens[i].find("ID:") == 0 ) {
	sRGID = tokens[i].substr(3);
      }
    }
  }
  
  gpLogger->write_log("Writing output BAM file");
  SamRecord samRecord;
  while (samIn.ReadRecord(samHeader, samRecord) == true) {
    if ( !sRGID.empty() ) {
      if ( samRecord.addTag("RG",'Z',sRGID.c_str()) == false ) {
	gpLogger->error("Failed to add a RG tag %s",sRGID.c_str());
      }
      // temporary code added
      if ( strncmp(samRecord.getReadName(),"seqcore_",8) == 0 ) {
	char buf[1024];
	sprintf(buf,"UM%s",samRecord.getReadName()+8);
	samRecord.setReadName(buf);
      }
    }
    samOut.WriteRecord(samHeader, samRecord);
    //if ( samIn.GetCurrentRecordCount() == 1000 ) break;
  }
  samOut.Close();
  gpLogger->write_log("Successfully written %d records",samIn.GetCurrentRecordCount());
  delete gpLogger;
  return 0;
}
Exemple #4
0
void parseOutRG(SamFileHeader& header, std::string& noRgPgString, SamFileHeader* newHeader, bool ignorePI)
{
    noRgPgString.clear();
    // strings for comparing if two RGs with same ID are the same.
    static std::string prevString = "";
    static std::string newString = "";

    SamHeaderRecord* rec = header.getNextHeaderRecord();
    while(rec != NULL)
    {
        if(rec->getType() == SamHeaderRecord::RG)
        {
            if(newHeader != NULL)
            {
                // This is an RG line.
                // First check if this RG is already included in the new header.
                SamHeaderRG* prevRG = newHeader->getRG(rec->getTagValue("ID"));
                
                if(prevRG != NULL)
                {
                    // This RG already exists, check that they are the same.
                    // If they are the same, there is nothing to do.
                    bool status = true;
                    prevString.clear();
                    newString.clear();
                    status &= prevRG->appendString(prevString);
                    status &= rec->appendString(newString);

                    if(prevString != newString)
                    {
                        if(!ignorePI)
                        {
                            Logger::gLogger->error("Failed to add readgroup to "
                                                   "header, duplicate, but "
                                                   "non-identical RG ID, %s\n"
                                                   "prev:\t(%s)\nnew:\t(%s)",
                                                   rec->getTagValue("ID"),
                                                   prevString.c_str(),
                                                   newString.c_str());
                        }
                        else
                        {
                            // Check for a PI string.
                            size_t prevPIStart = prevString.find("PI:");
                            size_t newPIStart = newString.find("PI:");

                            // If they are both npos, then PI was not found
                            // so fail.
                            if((prevPIStart == std::string::npos) &&
                               (newPIStart == std::string::npos))
                            {
                                // They are not identical, so report an error.
                                Logger::gLogger->error("Failed to add readgroup"
                                                       " to header, duplicate,"
                                                       " but non-identical RG"
                                                       " ID, %s\n"
                                                       "prev:\t(%s)\nnew:\t(%s)",
                                                       rec->getTagValue("ID"),
                                                       prevString.c_str(),
                                                       newString.c_str());
                            }
                            else
                            {
                                // PI found in one or both strings.
                                size_t prevPIEnd;
                                size_t newPIEnd;
                                if(prevPIStart == std::string::npos)
                                {
                                    // new string has PI, so compare to the start of that.
                                    prevPIStart = newPIStart;
                                    prevPIEnd = newPIStart;
                                }
                                else
                                {
                                    prevPIEnd = prevString.find('\t', prevPIStart) + 1;
                                }
                                if(newPIStart == std::string::npos)
                                {
                                    // new string has PI, so compare to the start of that.
                                    newPIStart = prevPIStart;
                                    newPIEnd = newPIStart;
                                }
                                else
                                {
                                    newPIEnd = newString.find('\t', newPIStart) + 1;
                                }
                                // Compare before PI.
                                if((newString.compare(0, newPIStart, prevString, 0, prevPIStart) != 0) ||
                                   (newString.compare(newPIEnd, std::string::npos, prevString,
                                                      prevPIEnd, std::string::npos) != 0))
                                {
                                    // They are not identical, so report an error.
                                    Logger::gLogger->error("Failed to add readgroup to header, "
                                                           "duplicate, but non-identical RG ID, %s, "
                                                           "even when ignoring PI\n"
                                                           "prev:\t(%s)\nnew:\t(%s)",
                                                           rec->getTagValue("ID"),
                                                           prevString.c_str(),
                                                           newString.c_str());
                                }
                                else
                                {
                                    Logger::gLogger->warning("Warning: ignoring non-identical PI field "
                                                             "for RG ID, %s",
                                                             rec->getTagValue("ID"));
                                }
                            }
                        }
                    }
                }
                else
                {
                    // This RG does not exist yet, so add it to the new header.
                    if(!newHeader->addRecordCopy((SamHeaderRG&)(*rec)))
                    {
                        // Failed to add the RG, exit.
                        Logger::gLogger->error("Failed to add readgroup to header, %s",
                                               newHeader->getErrorMessage());
                    }
                }
            }
        }
        else if(rec->getType() == SamHeaderRecord::PG)
        {
            if(newHeader != NULL)
            {
                // This is a PG line.
                // First check if this PG is already included in the new header.
                SamHeaderPG* prevPG = newHeader->getPG(rec->getTagValue("ID"));
                
                if(prevPG != NULL)
                {
                    // This PG already exists, check if they are the same.
                    // If they are the same, there is nothing to do.
                    bool status = true;
                    prevString.clear();
                    newString.clear();
                    status &= prevPG->appendString(prevString);
                    status &= rec->appendString(newString);
                    if(prevString != newString)
                    {
                        // They are not identical, ignore for now.
                        // TODO: change the ID, and add it.
                        Logger::gLogger->warning("Warning: dropping duplicate, "
                                                 "but non-identical PG ID, %s",
                                                 rec->getTagValue("ID"));
                    }
                }
                else
                {
                    // This PG does not exist yet, so add it to the new header.
                    if(!newHeader->addRecordCopy((SamHeaderPG&)(*rec)))
                    {
                        // Failed to add the PG, exit.
                        Logger::gLogger->error("Failed to add PG to header, %s",
                                               newHeader->getErrorMessage());
                    }
                }
            }
        }
        else
        {
            rec->appendString(noRgPgString);
        }
        rec = header.getNextHeaderRecord();
    }

    // Append the comments.
    header.appendCommentLines(noRgPgString);
}