Ejemplo n.º 1
0
void testAddHeaderAndTagToFile(const char* inputName, const char* outputName)
{
    SamFile inSam, outSam;
    assert(inSam.OpenForRead(inputName));
    assert(outSam.OpenForWrite(outputName));

    // Read the SAM Header.
    SamFileHeader samHeader;
    assert(inSam.ReadHeader(samHeader));

    // Add a header line.
    assert(samHeader.addHeaderLine("@RG\tID:myID\tSM:mySM") == false);
    assert(samHeader.addHeaderLine("@RG\tID:myID3\tSM:mySM") == true);

    // Write Header
    assert(outSam.WriteHeader(samHeader));

    SamRecord samRecord;
    assert(inSam.ReadRecord(samHeader, samRecord));
    //   validateRead1(samRecord);
    // Add two tags.
    assert(samRecord.addIntTag("XA", 123));
    assert(samRecord.addIntTag("XA", 456));
    assert(samRecord.addTag("RR", 'Z', "myID1"));
    assert(samRecord.addTag("RR", 'Z', "myID2"));

    // Write as Sam.
    assert(outSam.WriteRecord(samHeader, samRecord));

    // TODO, add test to verify it was written correctly.

    // Read a couple of records to make sure it properly can read them even
    // if they are bigger than the original.
    assert(inSam.ReadRecord(samHeader, samRecord));
    assert(inSam.ReadRecord(samHeader, samRecord));

    //  Check the MD tag, which requires the reference.
    GenomeSequence reference("testFiles/chr1_partial.fa");
    assert(SamTags::isMDTagCorrect(samRecord, reference) == false);
    String newMDTag;
    SamTags::createMDTag(newMDTag, samRecord, reference);
    assert(newMDTag == "2T1N0");
    assert(SamTags::updateMDTag(samRecord, reference));
    // Write as Sam.
    assert(outSam.WriteRecord(samHeader, samRecord));
}
Ejemplo n.º 2
0
int main(int argc, char ** argv)
{
  gpLogger = new Logger;

  static struct option getopt_long_options[] = 
    {
      // Input options
      { "fasta", required_argument, NULL, 'f'},
      { "in", required_argument, NULL, 'i'},
      { "out", required_argument, NULL, 'o'},
      { "verbose", no_argument, NULL, 'v'},
      { "log", required_argument, NULL, 'l'},
      { "clear", no_argument, NULL, 0},
      { "AS", required_argument, NULL, 0},
      { "UR", required_argument, NULL, 0},
      { "SP", required_argument, NULL, 0},
      { "HD", required_argument, NULL, 0},
      { "RG", required_argument, NULL, 0},
      { "PG", required_argument, NULL, 0},
      { "checkSQ", no_argument, NULL, 0},
      { NULL, 0, NULL, 0 },
    };

  int n_option_index = 0, c;
  
  std::string sAS, sUR, sSP, sFasta, sInFile, sOutFile, sLogFile;
  bool bClear, bCheckSQ, bVerbose;
  std::vector<std::string> vsHDHeaders, vsRGHeaders, vsPGHeaders;

  bCheckSQ = bVerbose = false;
  bClear = true;

  while ( (c = getopt_long(argc, argv, "vf:i:o:l:", getopt_long_options, &n_option_index)) != -1 ) {
      //    std::cout << getopt_long_options[n_option_index].name << "\t" << optarg << std::endl;
    if ( c == 'f' ) {
      sFasta = optarg;
    }
    else if ( c == 'i' ) {
      sInFile = optarg;
    }
    else if ( c == 'o' ) {
      sOutFile = optarg;
    }
    else if ( c == 'v' ) {
      bVerbose = true;
    }
    else if ( c == 'l' ) {
	sLogFile = optarg;
    }
    else if ( strcmp(getopt_long_options[n_option_index].name,"AS") == 0 ) {
      sAS = optarg;
    }
    else if ( strcmp(getopt_long_options[n_option_index].name,"UR") == 0 ) {
      sUR = optarg;
    }
    else if ( strcmp(getopt_long_options[n_option_index].name,"SP") == 0 ) {
      sSP = optarg;
    }
    else if ( strcmp(getopt_long_options[n_option_index].name,"HD") == 0 ) {
      vsHDHeaders.push_back(optarg);
    }
    else if ( strcmp(getopt_long_options[n_option_index].name,"RG") == 0 ) {
      vsRGHeaders.push_back(optarg);
    }
    else if ( strcmp(getopt_long_options[n_option_index].name,"PG") == 0 ) {
      vsPGHeaders.push_back(optarg);
    }
    else if ( strcmp(getopt_long_options[n_option_index].name,"checkSQ") == 0 ) {
      bCheckSQ = true;
    }
    else {
      std::cerr << "Error: Unrecognized option " << getopt_long_options[n_option_index].name << std::endl;
      abort();
    }
  }

  if ( optind < argc ) {
    printUsage(std::cerr);
    gpLogger->error("non-option argument %s exist ",argv[optind]);
  }

  if ( sInFile.empty() || sOutFile.empty() ) {
    printUsage(std::cerr);
    gpLogger->error("Input and output files are required");
  }

  if ( sLogFile.compare("__NONE__") == 0 ) {
    sLogFile = (sOutFile + ".log");
  }

  gpLogger->open(sLogFile.c_str(), bVerbose);

  if ( ( bCheckSQ ) && ( sFasta.empty() ) ) {
    printUsage(std::cerr);
    gpLogger->error("--checkSQ option must be used with --fasta option");
  }

  // check whether each header line starts with a correct tag
  checkHeaderStarts(vsHDHeaders, "@HD\t");
  checkHeaderStarts(vsRGHeaders, "@RG\t");
  checkHeaderStarts(vsPGHeaders, "@PG\t");

  gpLogger->write_log("Arguments in effect:");
  gpLogger->write_log("\t--in [%s]",sInFile.c_str());
  gpLogger->write_log("\t--out [%s]",sOutFile.c_str());
  gpLogger->write_log("\t--log [%s]",sLogFile.c_str());
  gpLogger->write_log("\t--fasta [%s]",sFasta.c_str());
  gpLogger->write_log("\t--AS [%s]",sAS.c_str());
  gpLogger->write_log("\t--UR [%s]",sUR.c_str());
  gpLogger->write_log("\t--SP [%s]",sSP.c_str());
  gpLogger->write_log("\t--checkSQ [%s]",bClear ? "ON" : "OFF" );
  if ( vsHDHeaders.empty() ) {
    gpLogger->write_log("\t--HD []");
  }
  else {
    gpLogger->write_log("\t--HD [%s]",vsHDHeaders[0].c_str());
  }
  if ( vsRGHeaders.empty() ) {
    gpLogger->write_log("\t--RG []");
  }
  else {
    gpLogger->write_log("\t--RG [%s]",vsRGHeaders[0].c_str());
  }
  if ( vsPGHeaders.empty() ) {
    gpLogger->write_log("\t--PG []");
  }
  else {
    for(uint32_t i=0; i < vsPGHeaders.size(); ++i) {
      gpLogger->write_log("\t--PG [%s]",vsPGHeaders[i].c_str());
    }
  }

  if ( (vsHDHeaders.empty() ) && ( vsRGHeaders.empty() ) && ( vsPGHeaders.empty() ) && ( !bClear ) && ( sFasta.empty() ) ) {
    gpLogger->warning("No option is in effect for modifying BAM files. The input and output files will be identical");
  }

  if ( ( vsHDHeaders.size() > 1 ) || ( vsRGHeaders.size() > 1 ) ) {
    gpLogger->error("HD and RG headers cannot be multiple");
  }

  FastaFile fastaFile;
  if ( ! sFasta.empty() ) {
    if ( fastaFile.open(sFasta.c_str()) ) {
      gpLogger->write_log("Reading the reference file %s",sFasta.c_str());
      fastaFile.readThru();
      fastaFile.close();
      gpLogger->write_log("Finished reading the reference file %s",sFasta.c_str());      
    }
    else {
      gpLogger->error("Failed to open reference file %s",sFasta.c_str());
    }
  }

  SamFile samIn;
  SamFile samOut;

  if ( ! samIn.OpenForRead(sInFile.c_str()) ) {
    gpLogger->error("Cannot open BAM file %s for reading - %s",sInFile.c_str(), SamStatus::getStatusString(samIn.GetStatus()) );
  }
  if ( ! samOut.OpenForWrite(sOutFile.c_str()) ) {
    gpLogger->error("Cannot open BAM file %s for writing - %s",sOutFile.c_str(), SamStatus::getStatusString(samOut.GetStatus()) );
  }

  SamFileHeader samHeader;
  SamHeaderRecord* pSamHeaderRecord;
  samIn.ReadHeader(samHeader);

  // check the sanity of SQ file
  // make sure the SN and LN matches, with the same order
  if ( bCheckSQ ) {
    unsigned int numSQ = 0;
    while( (pSamHeaderRecord = samHeader.getNextHeaderRecord()) != NULL ) {
      if ( pSamHeaderRecord->getType() == SamHeaderRecord::SQ ) {
	++numSQ;
      }
    }

    if ( numSQ != fastaFile.vsSequenceNames.size() ) {
      gpLogger->error("# of @SQ tags are different from the original BAM and the reference file");
    }

    // iterator over all @SQ objects
    for(unsigned int i=0; i < numSQ; ++i) {
      pSamHeaderRecord = samHeader.getSQ(fastaFile.vsSequenceNames[i].c_str());
      if ( fastaFile.vsSequenceNames[i].compare(pSamHeaderRecord->getTagValue("SN")) != 0 ) {
	gpLogger->error("SequenceName is not identical between fasta and input BAM file");
      }
      else if ( static_cast<int>(fastaFile.vnSequenceLengths[i]) != atoi(pSamHeaderRecord->getTagValue("LN")) ) {
	gpLogger->error("SequenceLength is not identical between fasta and input BAM file");
      }
      else {
	if ( !sAS.empty() ) 
	  samHeader.setSQTag("AS",sAS.c_str(),fastaFile.vsSequenceNames[i].c_str());
	samHeader.setSQTag("M5",fastaFile.vsMD5sums[i].c_str(),fastaFile.vsSequenceNames[i].c_str());
	if ( !sUR.empty() ) 
	  samHeader.setSQTag("UR",sUR.c_str(),fastaFile.vsSequenceNames[i].c_str());
	if ( !sSP.empty() ) 
	  samHeader.setSQTag("SP",sSP.c_str(),fastaFile.vsSequenceNames[i].c_str());
      }
    }
    gpLogger->write_log("Finished checking the consistency of SQ tags");
  }
  else {
    gpLogger->write_log("Skipped checking the consistency of SQ tags");
  }

  // go over the headers again, 
  // assuming order of HD, SQ, RG, PG, and put proper tags at the end of the original tags

  gpLogger->write_log("Creating the header of new output file");
  //SamFileHeader outHeader;
  samHeader.resetHeaderRecordIter();

  for(unsigned int i=0; i < vsHDHeaders.size(); ++i) {
    samHeader.addHeaderLine(vsHDHeaders[i].c_str());
  }

  /*
  for(int i=0; i < fastaFile.vsSequenceNames.size(); ++i) {
    std::string s("@SQ\tSN:");
    char buf[1024];
    s += fastaFile.vsSequenceNames[i];
    sprintf(buf,"\tLN:%d",fastaFile.vnSequenceLengths[i]);
    s += buf;
    if ( !sAS.empty() ) {
      sprintf(buf,"\tAS:%s",sAS.c_str());
      s += buf;
    }
    if ( !sUR.empty() ) {
      sprintf(buf,"\tUR:%s",sUR.c_str());
      s += buf;
    }
    sprintf(buf,"\tM5:%s",fastaFile.vsMD5sums[i].c_str());
    s += buf;
    if ( !sSP.empty() ) {
      sprintf(buf,"\tSP:%s",sSP.c_str());
      s += buf;
    }
    outHeader.addHeaderLine(s.c_str());
    }*/

  for(unsigned int i=0; i < vsRGHeaders.size(); ++i) {
    samHeader.addHeaderLine(vsRGHeaders[i].c_str());
  }

  for(unsigned int i=0; i < vsPGHeaders.size(); ++i) {
    samHeader.addHeaderLine(vsPGHeaders[i].c_str());
  }

  samOut.WriteHeader(samHeader);
  gpLogger->write_log("Adding %d HD, %d RG, and %d PG headers",vsHDHeaders.size(), vsRGHeaders.size(), vsPGHeaders.size());
  gpLogger->write_log("Finished writing output headers");

  // parse RG tag and get RG ID to append
  std::string sRGID;
  if ( ! vsRGHeaders.empty() ) {
    std::vector<std::string> tokens;
    FastaFile::tokenizeString( vsRGHeaders[0].c_str(), tokens );
    for(unsigned int i=0; i < tokens.size(); ++i) {
      if ( tokens[i].find("ID:") == 0 ) {
	sRGID = tokens[i].substr(3);
      }
    }
  }
  
  gpLogger->write_log("Writing output BAM file");
  SamRecord samRecord;
  while (samIn.ReadRecord(samHeader, samRecord) == true) {
    if ( !sRGID.empty() ) {
      if ( samRecord.addTag("RG",'Z',sRGID.c_str()) == false ) {
	gpLogger->error("Failed to add a RG tag %s",sRGID.c_str());
      }
      // temporary code added
      if ( strncmp(samRecord.getReadName(),"seqcore_",8) == 0 ) {
	char buf[1024];
	sprintf(buf,"UM%s",samRecord.getReadName()+8);
	samRecord.setReadName(buf);
      }
    }
    samOut.WriteRecord(samHeader, samRecord);
    //if ( samIn.GetCurrentRecordCount() == 1000 ) break;
  }
  samOut.Close();
  gpLogger->write_log("Successfully written %d records",samIn.GetCurrentRecordCount());
  delete gpLogger;
  return 0;
}
Ejemplo n.º 3
0
bool Recab::processReadApplyTable(SamRecord& samRecord)
{
    static BaseData data;
    static std::string readGroup;
    static std::string aligTypes;

    int seqLen = samRecord.getReadLength();

    uint16_t  flag = samRecord.getFlag();

    // Check if the flag contains an exclude.
    if((flag & myIntApplyExcludeFlags) != 0)
    {
        // Do not apply the recalibration table to this read.
        ++myNumApplySkipped;
        return(false);
    }
    ++myNumApplyReads;
   
    readGroup = samRecord.getString("RG").c_str();

    // Look for the read group in the map.
    // TODO - extra string constructor??
    RgInsertReturn insertRet = 
        myRg2Id.insert(std::pair<std::string, uint16_t>(readGroup, 0));
    if(insertRet.second == true)
    {
        // New element inserted.
        insertRet.first->second = myId2Rg.size();
        myId2Rg.push_back(readGroup);
    }

    data.rgid = insertRet.first->second;

    if(!myQField.IsEmpty())
    {
        // Check if there is an old quality.
        const String* oldQPtr =
            samRecord.getStringTag(myQField.c_str());
        if((oldQPtr != NULL) && (oldQPtr->Length() == seqLen))
        {
            // There is an old quality, so use that.
            myQualityStrings.oldq = oldQPtr->c_str();
        }
        else
        {
            myQualityStrings.oldq = samRecord.getQuality();
        }
    }
    else
    {
        myQualityStrings.oldq = samRecord.getQuality();
    }

    if(myQualityStrings.oldq.length() != (unsigned int)seqLen)
    {
        Logger::gLogger->warning("Quality is not the correct length, so skipping recalibration on that record.");
        return(false);
    }

    myQualityStrings.newq.resize(seqLen);

    ////////////////
    ////// iterate sequence
    ////////////////
    int32_t seqPos = 0;
    int seqIncr = 1;

    bool reverse;
    if(SamFlag::isReverse(flag))
    {
        reverse = true;
        seqPos = seqLen - 1;
        seqIncr = -1;
    }
    else
        reverse = false;

    // Check which read - this will be the same for all positions, so 
    // do this outside of the smaller loop.
    if(!SamFlag::isPaired(flag) || SamFlag::isFirstFragment(flag))
        // Mark as first if it is not paired or if it is the
        // first in the pair.
        data.read = 0;
    else
        data.read = 1;

    // Set unsetbase for curBase.
    // This will be used for the prebase of cycle 0.
    data.curBase = 'K';

    for (data.cycle = 0; data.cycle < seqLen; data.cycle++, seqPos += seqIncr)
    {
        // Set the preBase to the previous cycle's current base.
        // For cycle 0, curBase was set to a default value.
        data.preBase = data.curBase;

        // Get the current base.
        data.curBase = samRecord.getSequence(seqPos);

        if(reverse)
        {
            // Complement the current base.
            data.curBase =
                BaseAsciiMap::base2complement[(unsigned int)(data.curBase)];
        }

        // Get quality
        data.qual = 
            BaseUtilities::getPhredBaseQuality(myQualityStrings.oldq[seqPos]);

        // skip bases with quality below the minimum set.
        if(data.qual < myMinBaseQual)
        {
            myQualityStrings.newq[seqPos] = myQualityStrings.oldq[seqPos];
            continue;
        }

        // Update quality score
        uint8_t qemp = hasherrormodel.getQemp(data);
        qemp = mySqueeze.getQualCharFromQemp(qemp);
        if(qemp > myMaxBaseQualChar)
        {
            qemp = myMaxBaseQualChar;
        }
        myQualityStrings.newq[seqPos] = qemp;
    }

    if(!myStoreQualTag.IsEmpty())
    {
        samRecord.addTag(myStoreQualTag, 'Z', myQualityStrings.oldq.c_str());
    }
    samRecord.setQuality(myQualityStrings.newq.c_str());

    return true;
}
Ejemplo n.º 4
0
// add a readgroup tag to each SamRecord
void addReadGroupTag(SamRecord& record, ReadGroup& rg) {
  if ( record.addTag("RG",'Z',rg.s_id.c_str()) == false ) {
    Logger::gLogger->error("Failed to add readgroup tag %s",rg.s_id.c_str());
  }
}