Ejemplo n.º 1
0
void Bam2FastQ::handlePairedRN(SamRecord& samRec)
{
    static SamRecord* prevRec = NULL;
    static std::string prevRN = "";

    if(prevRec == NULL)
    {
        prevRec = &samRec;
    }
    else
    {
        if(strcmp(prevRec->getReadName(), samRec.getReadName()) != 0)
        {
            // Read Name does not match, error, did not find pair.
            std::cerr << "Paired Read, " << prevRec->getReadName()
                      << " but couldn't find mate, so writing as "
                      << "unpaired (single-ended)\n";
            ++myNumMateFailures;
            writeFastQ(*prevRec, myUnpairedFile,
                       myUnpairedFileNameExt);
            // Save this record to check against the next one.
            prevRec = &samRec;
        }
        else
        {
            // Matching ReadNames.
            // Found the mate.
            ++myNumPairs;
            // Check which is the first in the pair.
            if(SamFlag::isFirstFragment(samRec.getFlag()))
            {
                if(SamFlag::isFirstFragment(prevRec->getFlag()))
                {
                    std::cerr << "Both reads of " << samRec.getReadName()
                              << " are first fragment, so "
                              << "splitting one to be in the 2nd fastq.\n";
                }
                writeFastQ(samRec, myFirstFile, myFirstFileNameExt,
                           myFirstRNExt.c_str());
                writeFastQ(*prevRec, mySecondFile, mySecondFileNameExt,
                           mySecondRNExt.c_str());
            }
            else
            {
                if(!SamFlag::isFirstFragment(prevRec->getFlag()))
                {
                    std::cerr << "Neither read of " << samRec.getReadName()
                              << " are first fragment, so "
                              << "splitting one to be in the 2nd fastq.\n";
                }
                writeFastQ(*prevRec, myFirstFile, myFirstFileNameExt, myFirstRNExt.c_str());
                writeFastQ(samRec, mySecondFile, mySecondFileNameExt, mySecondRNExt.c_str());
            }
            // No previous record.
            prevRec = NULL;
        }
    }
}
Ejemplo n.º 2
0
void Demux::match(){
	BamReader bamReader(bamFilePath);
	SamRecord samRecord;
	while (	bamReader.getNextRecord(samRecord)) {
		string recordName(samRecord.getReadName());
		recordName = cHandler.decrypt(recordName);
		//clean record name
		int len=recordName.find("$");
		recordName=recordName.substr(0,len);
		//clean ended
		if (!isDecoy(recordName)){
			string outputFile = generateFileName(recordName);
			printf("%s\n",outputFile.c_str());
			if (writers.find(outputFile) == writers.end()) //if the BamWriter is not initialized
				writers[outputFile] = new BamWriter(outputFile, bamReader.getHeader());
			BamWriter *writer = writers[outputFile];
			writer->writeRecord(samRecord);
		}
    }
    for(auto it=writers.begin();it!=writers.end();it++){
    	BamWriter *writer = it->second;
    	writer->close();
    	delete writer;
    }
}
Ejemplo n.º 3
0
void Bam2FastQ::writeFastQ(SamRecord& samRec, IFILE filePtr,
                             const char* readNameExt)
{
    static int16_t flag;
    static std::string sequence;
    static String quality;

    if(filePtr == NULL)
    {
        return;
    }

    flag = samRec.getFlag();
    const char* readName = samRec.getReadName();
    sequence = samRec.getSequence();
    quality = samRec.getQuality();
    
    if(SamFlag::isReverse(flag) && myReverseComp)
    {
        // It is reverse, so reverse compliment the sequence
        BaseUtilities::reverseComplement(sequence);
        // Reverse the quality.
        quality.Reverse();
    }
    else
    {
        // Ensure it is all capitalized.
        int seqLen = sequence.size();
        for (int i = 0; i < seqLen; i++)
        {
            sequence[i] = (char)toupper(sequence[i]);
        }
    }
    
    if(myRNPlus)
    {

        ifprintf(filePtr, "@%s%s\n%s\n+%s%s\n%s\n", readName, readNameExt,
                 sequence.c_str(), readName, readNameExt, quality.c_str());
    }
    else
    {
        ifprintf(filePtr, "@%s%s\n%s\n+\n%s\n", readName, readNameExt,
                 sequence.c_str(), quality.c_str());
    }
    // Release the record.
    myPool.releaseRecord(&samRec);
}
Ejemplo n.º 4
0
bool Demux::verify(){ //at least one decoy should be present
	BamReader bamReader(bamFilePath);
	SamRecord samRecord;
	int verifiedDecoys = 0;
	while (	bamReader.getNextRecord(samRecord)) {
		string recordName(samRecord.getReadName());
		recordName = cHandler.decrypt(recordName);
		//clean record name
		int len=recordName.find("$");
		recordName=recordName.substr(0,len);
		//clean ended
		if (isDecoy(recordName)){
			char decoyChromosome[2][20];
			int decoyLocation[2];
			int decoyJobID;

			sscanf(	recordName.c_str(),"DECOY.%[^.].%d_%[^.].%d.%d",
					decoyChromosome[0], decoyLocation, 
					decoyChromosome[1], decoyLocation + 1,  &decoyJobID);

			int pair = 	!SamFlag::isFirstFragment(samRecord.getFlag());
			if (strcmp(decoyChromosome[pair],samRecord.getReferenceName())){
				cout << "Chromosome mismatch\n";
				cout << "Expected: " << decoyChromosome[pair] << endl;
				cout << "Got: " << samRecord.getReferenceName() << endl;
				return false;
			}
			if ( decoyLocation[pair] != samRecord.get0BasedPosition()){
				cout << "Position mismatch\n";
				cout << "Expected: " << decoyLocation[pair] << endl;
				cout << "Got: " << samRecord.get0BasedPosition() << endl;
				return false;
			}
			if (decoyJobID != jobID){
				cout << "Job ID mismatch\n";
				cout << "Expected: " << decoyJobID << endl;
				cout << "Got: " << jobID << endl;
				return false;
			}
			verifiedDecoys++;
		}
    }
    if (verifiedDecoys)
		return true;
	else
		return false;
}
Ejemplo n.º 5
0
// get the libraryID of a record
uint32_t Dedup_LowMem::getLibraryID(SamRecord& record, bool checkTags) {
    if ( ( checkTags == false ) && ( numLibraries <= 1 ) ) {
        return 0;
    } else {
        char tag[3];
        char vtype;
        void* value;
        std::string rgID;
        record.resetTagIter();
        while( record.getNextSamTag(tag,vtype,&value) != false ) {
            if ( ( tag[0] == 'R' ) && ( tag[1] == 'G' ) && ( vtype == 'Z' ) ) {
                if ( !rgID.empty() ) {
                    Logger::gLogger->error("Multiple RG tag found in one record. ReadName is %s",record.getReadName());
                }
                else if ( record.isStringType(vtype) ) {
                    String s = (String)*(String*)value;
                    rgID = s.c_str();
                }
                else {
                    Logger::gLogger->error("vtype is not string (Z) for RG tag");
                }
            }
        }
        if ( rgID.empty() ) {
            Logger::gLogger->error("No RG tag is found in read %s",record.getReadName());
            return 0;
        }
        else {
            std::map<std::string,uint32_t>::iterator it = rgidLibMap.find(rgID);
            if ( it != rgidLibMap.end() ) {
                return it->second;
            }
            else {
                Logger::gLogger->warning("RG tag %s does not exist in the header",rgID.c_str());
                return 0; // cannot be reached
            }
        }
    }
}
Ejemplo n.º 6
0
// When a record is read, check if it is a duplicate or
// store for future checking.
void Dedup_LowMem::checkDups(SamRecord& record, uint32_t recordCount)
{
    // Only inside this method if the record is mapped.

    // Get the key for this record.
    static DupKey key;
    key.initKey(record, getLibraryID(record));

    int flag = record.getFlag();
    bool recordPaired = SamFlag::isPaired(flag) && SamFlag::isMateMapped(flag);
    int sumBaseQual = getBaseQuality(record);

    int32_t chromID = record.getReferenceID();
    int32_t mateChromID = record.getMateReferenceID();

    // If we are one-chrom and the mate is not on the same chromosome,
    // mark it as not paired.
    if(myOneChrom && (chromID != mateChromID))
    {
        recordPaired = false;
    }

    // Look in the fragment map to see if an entry for this key exists.
    FragmentMapInsertReturn ireturn =
        myFragmentMap.insert(std::make_pair(key, FragData()));

    FragData* fragData = &(ireturn.first->second);

    // Enter the new record in the fragData if (any of the below):
    // 1) there is no previous entry for this key (ireturn.second == true)
    // or
    // 2) the previous entry is not paired
    //    AND
    //     a) the new record is paired
    //     or
    //     b) the new record has higher quality
    if((ireturn.second == true) ||
            ((fragData->paired == false) &&
             (recordPaired || (sumBaseQual > fragData->sumBaseQual))))
    {
        // Check if this is a new key.
        if(ireturn.second == true)
        {
            // New entry, so build the recalibration table now.
            if(myDoRecab)
            {
                myRecab.processReadBuildTable(record);
            }
        }
        else if(fragData->paired == false)
        {
            // There was a previous record and it is not paired,
            // so mark it as a duplicate.
            // Duplicate checking/marking for pairs is handled below.
            handleDuplicate(fragData->recordIndex);
        }

        // Store this record for later duplicate checking.
        fragData->sumBaseQual = sumBaseQual;
        fragData->recordIndex = recordCount;
        fragData->paired = recordPaired;
    }
    else
    {
        // Leave the old record in fragData.
        // If the new record is not paired, handle it as a duplicate.
        if(recordPaired == false)
        {
            // This record is a duplicate, so mark it and release it.
            handleDuplicate(recordCount);
        }
    }

    // Only paired processing is left, so return if not paired.
    if(recordPaired == false)
    {
        // Not paired, no more operations required, so return.
        return;
    }

    // This is a paired record, so check for its mate.
    uint64_t readPos =
        SamHelper::combineChromPos(chromID,
                                   record.get0BasedPosition());
    uint64_t matePos =
        SamHelper::combineChromPos(mateChromID,
                                   record.get0BasedMatePosition());
    int mateIndex = -1;
    MateData* mateData = NULL;

    // Check to see if the mate is prior to this record.
    if(matePos <= readPos)
    {
        // The mate map is stored by the mate position, so look for this
        // record's position.
        // The mate should be in the mate map, so find it.
        std::pair<MateMap::iterator,MateMap::iterator> matches =
            myMateMap.equal_range(readPos);
        // Loop through the elements that matched the pos looking for the mate.
        for(MateMap::iterator iter = matches.first;
                iter != matches.second; iter++)
        {
            if(strcmp((*iter).second.readName.c_str(),
                      record.getReadName()) == 0)
            {
                // Found the match.
                mateData = &((*iter).second);
                // Update the quality and track the mate record and index.
                sumBaseQual += mateData->sumBaseQual;
                mateIndex = mateData->recordIndex;
                // Remove the entry from the map.
                myMateMap.erase(iter);
                break;
            }
        }
    }
    if(mateData == NULL)
    {
        if(matePos >= readPos)
        {
            // Haven't gotten to the mate yet, so store this record.
            MateMap::iterator mateIter =
                myMateMap.insert(std::make_pair(matePos, MateData()));
            mateIter->second.sumBaseQual = sumBaseQual;
            mateIter->second.recordIndex = recordCount;
            mateIter->second.key.copy(key);
            mateIter->second.readName = record.getReadName();
        }
        else
        {
            // Passed the mate, but it was not found.
            handleMissingMate(record.getReferenceID(), record.getMateReferenceID());
        }
        return;
    }

    // Make the paired key.
    PairedKey pkey(key, mateData->key);

    // Check to see if this pair is a duplicate.
    PairedMapInsertReturn pairedReturn =
        myPairedMap.insert(std::make_pair(pkey,PairedData()));
    PairedData* storedPair = &(pairedReturn.first->second);

    // Get the index for "record 1" - the one with the earlier coordinate.
    int record1Index = getFirstIndex(key, recordCount,
                                     mateData->key, mateIndex);

    // Check if we have already found a duplicate pair.
    // If there is no duplicate found, there is nothing more to do.
    if(pairedReturn.second == false)
    {
        // Duplicate found.
        bool keepStored = true;
        if(pairedReturn.first->second.sumBaseQual < sumBaseQual)
        {
            // The new pair has higher quality, so keep that.
            keepStored = false;
        }
        else if(pairedReturn.first->second.sumBaseQual == sumBaseQual)
        {
            // Same quality, so keep the one with the earlier record1Index.
            if(record1Index < storedPair->record1Index)
            {
                // The new pair has an earlier lower coordinate read,
                // so keep that.
                keepStored = false;
            }
        }
        // Check to see which one should be kept by checking qualities.
        if(keepStored)
        {
            // The old pair had higher quality so mark the new pair as a
            // duplicate and release them.
            handleDuplicate(mateIndex);
            handleDuplicate(recordCount);
        }
        else
        {
            // The new pair has higher quality, so keep that.
            // First mark the previous one as duplicates and release them.
            handleDuplicate(storedPair->record1Index);
            handleDuplicate(storedPair->record2Index);
            // Store this pair's information.
            if(record1Index == mateIndex)
            {
                // Mate has a lower coordinate, so make mate
                // record1.
                storedPair->sumBaseQual = sumBaseQual;
                storedPair->record1Index = mateIndex;
                storedPair->record2Index = recordCount;
            }
            else
            {
                // This record has a lower coordinate, so make it
                // record1.
                storedPair->sumBaseQual = sumBaseQual;
                storedPair->record1Index = recordCount;
                storedPair->record2Index = mateIndex;
            }
        }
    }
    else
    {
        // Store this pair's information.
        storedPair->sumBaseQual = sumBaseQual;

        if(record1Index == mateIndex)
        {
            // Mate has a lower coordinate, so make mate
            // record1.
            storedPair->record1Index = mateIndex;
            storedPair->record2Index = recordCount;
        }
        else
        {
            // This record has a lower coordinate, so make it
            // record1.
            storedPair->record1Index = recordCount;
            storedPair->record2Index = mateIndex;
        }
    }
}
Ejemplo n.º 7
0
void Bam2FastQ::writeFastQ(SamRecord& samRec, IFILE filePtr,
                           const std::string& fileNameExt, const char* readNameExt)
{
    static int16_t flag;
    static std::string sequence;
    static String quality;
    static std::string rg;
    static std::string rgFastqExt;
    static std::string rgListStr;
    static std::string fileName;
    static std::string fq2;
    if(mySplitRG)
    {
        rg = samRec.getString("RG").c_str();
        rgFastqExt = rg + fileNameExt;

        OutFastqMap::iterator it;
        it = myOutFastqs.find(rgFastqExt);
        if(it == myOutFastqs.end())
        {
            // New file.
            fileName = myOutBase.c_str();
            if(rg != "")
            {
                fileName += '.';
            }
            else
            {
                rg = ".";
            }
            fileName += rgFastqExt;
            filePtr = ifopen(fileName.c_str(), "w", myCompression);
            myOutFastqs[rgFastqExt] = filePtr;

            if(fileNameExt != mySecondFileNameExt)
            {
                // first end.
                const char* sm = mySamHeader.getRGTagValue("SM", rg.c_str());
                if(strcmp(sm, "") == 0){sm = myOutBase.c_str();}

                rgListStr.clear();
                SamHeaderRG* rgPtr = mySamHeader.getRG(rg.c_str());
                if((rgPtr == NULL) || (!rgPtr->appendString(rgListStr)))
                {
                    // No RG info for this record.
                    rgListStr = ".\n";
                }
                fq2 = ".";
                if(fileNameExt == myFirstFileNameExt)
                {
                    fq2 = myOutBase.c_str();
                    if(rg != ".")
                    {
                        fq2 += '.';
                        fq2 += rg;
                    }
                    fq2 += mySecondFileNameExt;
                }
                ifprintf(myFqList, "%s\t%s\t%s\t%s",
                         sm, fileName.c_str(), fq2.c_str(),
                         rgListStr.c_str());
            }
        }
        else
        {
            filePtr = it->second;
        }
    }
    if(filePtr == NULL)
    {
        throw(std::runtime_error("Programming ERROR/EXITING: Bam2FastQ filePtr not set."));
        return;
    }

    flag = samRec.getFlag();
    const char* readName = samRec.getReadName();
    sequence = samRec.getSequence();
    if(myQField.IsEmpty())
    {
        // Read the quality from the quality field
        quality = samRec.getQuality();
    }
    else
    {
        // Read Quality from the specified tag
        const String* qTagPtr = samRec.getStringTag(myQField.c_str());
        if((qTagPtr != NULL) && (qTagPtr->Length() == (int)sequence.length()))
        {
            // Use the tag value for quality
            quality = qTagPtr->c_str();
        }
        else
        {
            // Tag was not found, so use the quality field.
            ++myNumQualTagErrors;
            if(myNumQualTagErrors == 1)
            {
                std::cerr << "Bam2FastQ: " << myQField.c_str() 
                          << " tag was not found/invalid, so using the quality field in records without the tag\n";
            }
            quality = samRec.getQuality();
        }
    }
    
    if(SamFlag::isReverse(flag) && myReverseComp)
    {
        // It is reverse, so reverse compliment the sequence
        BaseUtilities::reverseComplement(sequence);
        // Reverse the quality.
        quality.Reverse();
    }
    else
    {
        // Ensure it is all capitalized.
        int seqLen = sequence.size();
        for (int i = 0; i < seqLen; i++)
        {
            sequence[i] = (char)toupper(sequence[i]);
        }
    }
    
    if(myRNPlus)
    {

        ifprintf(filePtr, "@%s%s\n%s\n+%s%s\n%s\n", readName, readNameExt,
                 sequence.c_str(), readName, readNameExt, quality.c_str());
    }
    else
    {
        ifprintf(filePtr, "@%s%s\n%s\n+\n%s\n", readName, readNameExt,
                 sequence.c_str(), quality.c_str());
    }
    // Release the record.
    myPool.releaseRecord(&samRec);
}
Ejemplo n.º 8
0
void Bam2FastQ::handlePairedCoord(SamRecord& samRec)
{
    static uint64_t readPos;
    static uint64_t matePos;
    static SamRecord* mateRec;

    // This is a paired record, so check for its mate.
    readPos = SamHelper::combineChromPos(samRec.getReferenceID(),
                                         samRec.get0BasedPosition());
    matePos = SamHelper::combineChromPos(samRec.getMateReferenceID(), 
                                         samRec.get0BasedMatePosition());
 
    // Check to see if the mate is prior to this record.
    if(matePos <= readPos)
    {
        // The mate is prior to this position, so look for this record.
        mateRec = myMateMap.getMate(samRec);
        if(mateRec == NULL)
        {
            // If they are the same position, add it to the map.
            if(matePos == readPos)
            {
                myMateMap.add(samRec);

                // Check to see if the mate map can be cleaned up prior
                // to this position.
                cleanUpMateMap(readPos);
            }
            else
            {
                // Paired Read, but could not find mate.
                std::cerr << "Paired Read, " << samRec.getReadName()
                          << " but couldn't find mate, so writing as "
                          << "unpaired (single-ended)\n";
                ++myNumMateFailures;
                writeFastQ(samRec, myUnpairedFile, myUnpairedFileNameExt);
            }
        }
        else
        {
            // Found the mate.
            ++myNumPairs;
            // Check which is the first in the pair.
            if(SamFlag::isFirstFragment(samRec.getFlag()))
            {
                if(SamFlag::isFirstFragment(mateRec->getFlag()))
                {
                    std::cerr << "Both reads of " << samRec.getReadName()
                              << " are first fragment, so "
                              << "splitting one to be in the 2nd fastq.\n";
                }
                writeFastQ(samRec, myFirstFile, myFirstFileNameExt, myFirstRNExt.c_str());
                writeFastQ(*mateRec, mySecondFile, mySecondFileNameExt, mySecondRNExt.c_str());
            }
            else
            {
                if(!SamFlag::isFirstFragment(mateRec->getFlag()))
                {
                    std::cerr << "Neither read of " << samRec.getReadName()
                              << " are first fragment, so "
                              << "splitting one to be in the 2nd fastq.\n";
                }
                writeFastQ(*mateRec, myFirstFile, myFirstFileNameExt, myFirstRNExt.c_str());
                writeFastQ(samRec, mySecondFile, mySecondFileNameExt, mySecondRNExt.c_str());
            }
        }
    }
    else
    {
        // Haven't gotten to the mate yet, so store it.
        myMateMap.add(samRec);

        // Check to see if the mate map can be cleaned up.
        cleanUpMateMap(readPos);
    }
}
Ejemplo n.º 9
0
int main(int argc, char ** argv)
{
  gpLogger = new Logger;

  static struct option getopt_long_options[] = 
    {
      // Input options
      { "fasta", required_argument, NULL, 'f'},
      { "in", required_argument, NULL, 'i'},
      { "out", required_argument, NULL, 'o'},
      { "verbose", no_argument, NULL, 'v'},
      { "log", required_argument, NULL, 'l'},
      { "clear", no_argument, NULL, 0},
      { "AS", required_argument, NULL, 0},
      { "UR", required_argument, NULL, 0},
      { "SP", required_argument, NULL, 0},
      { "HD", required_argument, NULL, 0},
      { "RG", required_argument, NULL, 0},
      { "PG", required_argument, NULL, 0},
      { "checkSQ", no_argument, NULL, 0},
      { NULL, 0, NULL, 0 },
    };

  int n_option_index = 0, c;
  
  std::string sAS, sUR, sSP, sFasta, sInFile, sOutFile, sLogFile;
  bool bClear, bCheckSQ, bVerbose;
  std::vector<std::string> vsHDHeaders, vsRGHeaders, vsPGHeaders;

  bCheckSQ = bVerbose = false;
  bClear = true;

  while ( (c = getopt_long(argc, argv, "vf:i:o:l:", getopt_long_options, &n_option_index)) != -1 ) {
      //    std::cout << getopt_long_options[n_option_index].name << "\t" << optarg << std::endl;
    if ( c == 'f' ) {
      sFasta = optarg;
    }
    else if ( c == 'i' ) {
      sInFile = optarg;
    }
    else if ( c == 'o' ) {
      sOutFile = optarg;
    }
    else if ( c == 'v' ) {
      bVerbose = true;
    }
    else if ( c == 'l' ) {
	sLogFile = optarg;
    }
    else if ( strcmp(getopt_long_options[n_option_index].name,"AS") == 0 ) {
      sAS = optarg;
    }
    else if ( strcmp(getopt_long_options[n_option_index].name,"UR") == 0 ) {
      sUR = optarg;
    }
    else if ( strcmp(getopt_long_options[n_option_index].name,"SP") == 0 ) {
      sSP = optarg;
    }
    else if ( strcmp(getopt_long_options[n_option_index].name,"HD") == 0 ) {
      vsHDHeaders.push_back(optarg);
    }
    else if ( strcmp(getopt_long_options[n_option_index].name,"RG") == 0 ) {
      vsRGHeaders.push_back(optarg);
    }
    else if ( strcmp(getopt_long_options[n_option_index].name,"PG") == 0 ) {
      vsPGHeaders.push_back(optarg);
    }
    else if ( strcmp(getopt_long_options[n_option_index].name,"checkSQ") == 0 ) {
      bCheckSQ = true;
    }
    else {
      std::cerr << "Error: Unrecognized option " << getopt_long_options[n_option_index].name << std::endl;
      abort();
    }
  }

  if ( optind < argc ) {
    printUsage(std::cerr);
    gpLogger->error("non-option argument %s exist ",argv[optind]);
  }

  if ( sInFile.empty() || sOutFile.empty() ) {
    printUsage(std::cerr);
    gpLogger->error("Input and output files are required");
  }

  if ( sLogFile.compare("__NONE__") == 0 ) {
    sLogFile = (sOutFile + ".log");
  }

  gpLogger->open(sLogFile.c_str(), bVerbose);

  if ( ( bCheckSQ ) && ( sFasta.empty() ) ) {
    printUsage(std::cerr);
    gpLogger->error("--checkSQ option must be used with --fasta option");
  }

  // check whether each header line starts with a correct tag
  checkHeaderStarts(vsHDHeaders, "@HD\t");
  checkHeaderStarts(vsRGHeaders, "@RG\t");
  checkHeaderStarts(vsPGHeaders, "@PG\t");

  gpLogger->write_log("Arguments in effect:");
  gpLogger->write_log("\t--in [%s]",sInFile.c_str());
  gpLogger->write_log("\t--out [%s]",sOutFile.c_str());
  gpLogger->write_log("\t--log [%s]",sLogFile.c_str());
  gpLogger->write_log("\t--fasta [%s]",sFasta.c_str());
  gpLogger->write_log("\t--AS [%s]",sAS.c_str());
  gpLogger->write_log("\t--UR [%s]",sUR.c_str());
  gpLogger->write_log("\t--SP [%s]",sSP.c_str());
  gpLogger->write_log("\t--checkSQ [%s]",bClear ? "ON" : "OFF" );
  if ( vsHDHeaders.empty() ) {
    gpLogger->write_log("\t--HD []");
  }
  else {
    gpLogger->write_log("\t--HD [%s]",vsHDHeaders[0].c_str());
  }
  if ( vsRGHeaders.empty() ) {
    gpLogger->write_log("\t--RG []");
  }
  else {
    gpLogger->write_log("\t--RG [%s]",vsRGHeaders[0].c_str());
  }
  if ( vsPGHeaders.empty() ) {
    gpLogger->write_log("\t--PG []");
  }
  else {
    for(uint32_t i=0; i < vsPGHeaders.size(); ++i) {
      gpLogger->write_log("\t--PG [%s]",vsPGHeaders[i].c_str());
    }
  }

  if ( (vsHDHeaders.empty() ) && ( vsRGHeaders.empty() ) && ( vsPGHeaders.empty() ) && ( !bClear ) && ( sFasta.empty() ) ) {
    gpLogger->warning("No option is in effect for modifying BAM files. The input and output files will be identical");
  }

  if ( ( vsHDHeaders.size() > 1 ) || ( vsRGHeaders.size() > 1 ) ) {
    gpLogger->error("HD and RG headers cannot be multiple");
  }

  FastaFile fastaFile;
  if ( ! sFasta.empty() ) {
    if ( fastaFile.open(sFasta.c_str()) ) {
      gpLogger->write_log("Reading the reference file %s",sFasta.c_str());
      fastaFile.readThru();
      fastaFile.close();
      gpLogger->write_log("Finished reading the reference file %s",sFasta.c_str());      
    }
    else {
      gpLogger->error("Failed to open reference file %s",sFasta.c_str());
    }
  }

  SamFile samIn;
  SamFile samOut;

  if ( ! samIn.OpenForRead(sInFile.c_str()) ) {
    gpLogger->error("Cannot open BAM file %s for reading - %s",sInFile.c_str(), SamStatus::getStatusString(samIn.GetStatus()) );
  }
  if ( ! samOut.OpenForWrite(sOutFile.c_str()) ) {
    gpLogger->error("Cannot open BAM file %s for writing - %s",sOutFile.c_str(), SamStatus::getStatusString(samOut.GetStatus()) );
  }

  SamFileHeader samHeader;
  SamHeaderRecord* pSamHeaderRecord;
  samIn.ReadHeader(samHeader);

  // check the sanity of SQ file
  // make sure the SN and LN matches, with the same order
  if ( bCheckSQ ) {
    unsigned int numSQ = 0;
    while( (pSamHeaderRecord = samHeader.getNextHeaderRecord()) != NULL ) {
      if ( pSamHeaderRecord->getType() == SamHeaderRecord::SQ ) {
	++numSQ;
      }
    }

    if ( numSQ != fastaFile.vsSequenceNames.size() ) {
      gpLogger->error("# of @SQ tags are different from the original BAM and the reference file");
    }

    // iterator over all @SQ objects
    for(unsigned int i=0; i < numSQ; ++i) {
      pSamHeaderRecord = samHeader.getSQ(fastaFile.vsSequenceNames[i].c_str());
      if ( fastaFile.vsSequenceNames[i].compare(pSamHeaderRecord->getTagValue("SN")) != 0 ) {
	gpLogger->error("SequenceName is not identical between fasta and input BAM file");
      }
      else if ( static_cast<int>(fastaFile.vnSequenceLengths[i]) != atoi(pSamHeaderRecord->getTagValue("LN")) ) {
	gpLogger->error("SequenceLength is not identical between fasta and input BAM file");
      }
      else {
	if ( !sAS.empty() ) 
	  samHeader.setSQTag("AS",sAS.c_str(),fastaFile.vsSequenceNames[i].c_str());
	samHeader.setSQTag("M5",fastaFile.vsMD5sums[i].c_str(),fastaFile.vsSequenceNames[i].c_str());
	if ( !sUR.empty() ) 
	  samHeader.setSQTag("UR",sUR.c_str(),fastaFile.vsSequenceNames[i].c_str());
	if ( !sSP.empty() ) 
	  samHeader.setSQTag("SP",sSP.c_str(),fastaFile.vsSequenceNames[i].c_str());
      }
    }
    gpLogger->write_log("Finished checking the consistency of SQ tags");
  }
  else {
    gpLogger->write_log("Skipped checking the consistency of SQ tags");
  }

  // go over the headers again, 
  // assuming order of HD, SQ, RG, PG, and put proper tags at the end of the original tags

  gpLogger->write_log("Creating the header of new output file");
  //SamFileHeader outHeader;
  samHeader.resetHeaderRecordIter();

  for(unsigned int i=0; i < vsHDHeaders.size(); ++i) {
    samHeader.addHeaderLine(vsHDHeaders[i].c_str());
  }

  /*
  for(int i=0; i < fastaFile.vsSequenceNames.size(); ++i) {
    std::string s("@SQ\tSN:");
    char buf[1024];
    s += fastaFile.vsSequenceNames[i];
    sprintf(buf,"\tLN:%d",fastaFile.vnSequenceLengths[i]);
    s += buf;
    if ( !sAS.empty() ) {
      sprintf(buf,"\tAS:%s",sAS.c_str());
      s += buf;
    }
    if ( !sUR.empty() ) {
      sprintf(buf,"\tUR:%s",sUR.c_str());
      s += buf;
    }
    sprintf(buf,"\tM5:%s",fastaFile.vsMD5sums[i].c_str());
    s += buf;
    if ( !sSP.empty() ) {
      sprintf(buf,"\tSP:%s",sSP.c_str());
      s += buf;
    }
    outHeader.addHeaderLine(s.c_str());
    }*/

  for(unsigned int i=0; i < vsRGHeaders.size(); ++i) {
    samHeader.addHeaderLine(vsRGHeaders[i].c_str());
  }

  for(unsigned int i=0; i < vsPGHeaders.size(); ++i) {
    samHeader.addHeaderLine(vsPGHeaders[i].c_str());
  }

  samOut.WriteHeader(samHeader);
  gpLogger->write_log("Adding %d HD, %d RG, and %d PG headers",vsHDHeaders.size(), vsRGHeaders.size(), vsPGHeaders.size());
  gpLogger->write_log("Finished writing output headers");

  // parse RG tag and get RG ID to append
  std::string sRGID;
  if ( ! vsRGHeaders.empty() ) {
    std::vector<std::string> tokens;
    FastaFile::tokenizeString( vsRGHeaders[0].c_str(), tokens );
    for(unsigned int i=0; i < tokens.size(); ++i) {
      if ( tokens[i].find("ID:") == 0 ) {
	sRGID = tokens[i].substr(3);
      }
    }
  }
  
  gpLogger->write_log("Writing output BAM file");
  SamRecord samRecord;
  while (samIn.ReadRecord(samHeader, samRecord) == true) {
    if ( !sRGID.empty() ) {
      if ( samRecord.addTag("RG",'Z',sRGID.c_str()) == false ) {
	gpLogger->error("Failed to add a RG tag %s",sRGID.c_str());
      }
      // temporary code added
      if ( strncmp(samRecord.getReadName(),"seqcore_",8) == 0 ) {
	char buf[1024];
	sprintf(buf,"UM%s",samRecord.getReadName()+8);
	samRecord.setReadName(buf);
      }
    }
    samOut.WriteRecord(samHeader, samRecord);
    //if ( samIn.GetCurrentRecordCount() == 1000 ) break;
  }
  samOut.Close();
  gpLogger->write_log("Successfully written %d records",samIn.GetCurrentRecordCount());
  delete gpLogger;
  return 0;
}
Ejemplo n.º 10
0
SamStatus::Status ClipOverlap::handleSortedByReadName(SamFile& samIn, 
                                                      SamFile* samOutPtr)
{
    // Set returnStatus to success.  It will be changed
    // to the failure reason if any of the writes fail.
    SamStatus::Status returnStatus = SamStatus::SUCCESS;

    // Read the sam records.
    SamRecord* prevSamRecord = NULL;
    SamRecord* samRecord = new SamRecord;
    SamRecord* tmpRecord = new SamRecord;
    if((samRecord == NULL) || (tmpRecord == NULL))
    {
        std::cerr << "Failed to allocate a SamRecord, so exit.\n";
        return(SamStatus::FAIL_MEM);
    }

    // Keep reading records until ReadRecord returns false.
    while(samIn.ReadRecord(mySamHeader, *samRecord))
    {
        int16_t flag = samRecord->getFlag();
        if((flag & myIntExcludeFlags) != 0)
        {
            // This read should not be checked for overlaps.

            // Check if there is a previous SamRecord.
            if(prevSamRecord != NULL)
            {
                // There is a previous record.
                // If it has a different read name, write it.
                if(strcmp(samRecord->getReadName(), 
                          prevSamRecord->getReadName()) != 0)
                {
                    // Different read name, so write the previous record.
                    if((samOutPtr != NULL) && !myOverlapsOnly)
                    {
                        if(!samOutPtr->WriteRecord(mySamHeader, *prevSamRecord))
                        {
                            // Failed to write a record.
                            fprintf(stderr, "%s\n", samOutPtr->GetStatusMessage());
                            returnStatus = samOutPtr->GetStatus();
                        }
                    }
                    // Clear the previous record info.
                    tmpRecord = prevSamRecord;
                    prevSamRecord = NULL;
                } 
                // If it has the same read name, leave it in case there is another read with that name
            }
            // This record is not being checked for overlaps, so just write it and continue
            if((samOutPtr != NULL) && !myOverlapsOnly)
            {
                if(!samOutPtr->WriteRecord(mySamHeader, *samRecord))
                {
                    // Failed to write a record.
                    fprintf(stderr, "%s\n", samOutPtr->GetStatusMessage());
                    returnStatus = samOutPtr->GetStatus();
                }
            }
            continue;
        }

        if(prevSamRecord == NULL)
        {
            // Nothing to compare this record to, so set this
            // record to the previous, and the next record.
            prevSamRecord = samRecord;
            samRecord = tmpRecord;
            tmpRecord = NULL;
            continue;
        }

        // Check if the read name matches the previous read name.
        if(strcmp(samRecord->getReadName(), 
                  prevSamRecord->getReadName()) == 0)
        {
            bool overlap = false;
            // Same Read Name, so check clipping.
            OverlapHandler::OverlapInfo prevClipInfo = 
                myOverlapHandler->getOverlapInfo(*prevSamRecord);
            OverlapHandler::OverlapInfo curClipInfo = 
                myOverlapHandler->getOverlapInfo(*samRecord);
            
            // If either indicate a complete clipping, clip both.
            if((prevClipInfo == OverlapHandler::NO_OVERLAP_WRONG_ORIENT) ||
               (curClipInfo == OverlapHandler::NO_OVERLAP_WRONG_ORIENT))
            {
                overlap = true;
                myOverlapHandler->handleNoOverlapWrongOrientation(*prevSamRecord);
                // Don't update stats since this is the 2nd in the pair
                myOverlapHandler->handleNoOverlapWrongOrientation(*samRecord, 
                                                                  false);
            }
            else if((prevClipInfo == OverlapHandler::OVERLAP) ||
                    (prevClipInfo == OverlapHandler::SAME_START))
            {
                // The previous read starts at or before the current one.
                overlap = true;
                myOverlapHandler->handleOverlapPair(*prevSamRecord,
                                                    *samRecord);
            }
            else if(curClipInfo == OverlapHandler::OVERLAP)
            {
                // The current read starts before the previous one.
                overlap = true;
                myOverlapHandler->handleOverlapPair(*samRecord,
                                                    *prevSamRecord);
            }
            
            // Found a read pair, so write both records if: 
            //   1) output file is specified
            //   AND
            //     2a) all records should be written
            //     OR
            //     2b) the pair overlaps
            if((samOutPtr != NULL) && (!myOverlapsOnly || overlap))
            {
                if(!samOutPtr->WriteRecord(mySamHeader, *prevSamRecord))
                {
                    // Failed to write a record.
                    fprintf(stderr, "%s\n", samOutPtr->GetStatusMessage());
                    returnStatus = samOutPtr->GetStatus();
                }
                if(!samOutPtr->WriteRecord(mySamHeader, *samRecord))
                {
                    // Failed to write a record.
                    fprintf(stderr, "%s\n", samOutPtr->GetStatusMessage());
                    returnStatus = samOutPtr->GetStatus();
                }
            }
            // Setup for the next read with no previous.
            tmpRecord = prevSamRecord;
            prevSamRecord = NULL;
            
        }
        else
        {
            // Read name does not match, so write the previous record
            // if we are writing all records.
            if((samOutPtr != NULL) && !myOverlapsOnly)
            {
                if(!samOutPtr->WriteRecord(mySamHeader, *prevSamRecord))
                {
                    // Failed to write a record.
                    fprintf(stderr, "%s\n", samOutPtr->GetStatusMessage());
                    returnStatus = samOutPtr->GetStatus();
                }
            }
            // Store this record as the previous.
            tmpRecord = prevSamRecord;
            prevSamRecord = samRecord;
            samRecord = tmpRecord;
            tmpRecord = NULL;
        }
    }

    // Write the previous record if there is one.
    if((samOutPtr != NULL) && (prevSamRecord != NULL) && !myOverlapsOnly)
    {
        if(!samOutPtr->WriteRecord(mySamHeader, *prevSamRecord))
        {
            // Failed to write a record.
            fprintf(stderr, "%s\n", samOutPtr->GetStatusMessage());
            returnStatus = samOutPtr->GetStatus();
        }
        delete prevSamRecord;
    }

    if(samRecord != NULL)
    {
        delete samRecord;
    }
    if(tmpRecord != NULL)
    {
        delete tmpRecord;
    }

    if(samIn.GetStatus() != SamStatus::NO_MORE_RECS)
    {
        return(samIn.GetStatus());
    }
    return(returnStatus);
}
Ejemplo n.º 11
0
// Validate that the record is sorted compared to the previously read record
// if there is one, according to the specified sort order.
// If the sort order is UNSORTED, true is returned.
bool SamFile::validateSortOrder(SamRecord& record, SamFileHeader& header)
{
    if(myRefPtr != NULL)
    {
        record.setReference(myRefPtr);
    }
    record.setSequenceTranslation(myReadTranslation);

    bool status = false;
    if(mySortedType == UNSORTED)
    {
        // Unsorted, so nothing to validate, just return true.
        status = true;
    }
    else 
    {
        // Check to see if mySortedType is based on the header.
        if(mySortedType == FLAG)
        {
            // Determine the sorted type from what was read out of the header.
            mySortedType = getSortOrderFromHeader(header);
        }

        if(mySortedType == QUERY_NAME)
        {
            // Validate that it is sorted by query name.
            // Get the query name from the record.
            const char* readName = record.getReadName();

            // Check if it is sorted either in samtools way or picard's way.
            if((myPrevReadName.Compare(readName) > 0) &&
               (strcmp(myPrevReadName.c_str(), readName) > 0))
            {
                // return false.
                String errorMessage = "ERROR: File is not sorted by read name at record ";
                errorMessage += myRecordCount;
                errorMessage += "\n\tPrevious record was ";
                errorMessage += myPrevReadName;
                errorMessage += ", but this record is ";
                errorMessage += readName;
                myStatus.setStatus(SamStatus::INVALID_SORT, 
                                   errorMessage.c_str());
                status = false;
            }
            else
            {
                myPrevReadName = readName;
                status = true;
            }
        }
        else 
        {
            // Validate that it is sorted by COORDINATES.
            // Get the leftmost coordinate and the reference index.
            int32_t refID = record.getReferenceID();
            int32_t coord = record.get0BasedPosition();
            // The unmapped reference id is at the end of a sorted file.
            if(refID == BamIndex::REF_ID_UNMAPPED)
            {
                // A new reference ID that is for the unmapped reads
                // is always valid.
                status = true;
                myPrevRefID = refID;
                myPrevCoord = coord;
            }
            else if(myPrevRefID == BamIndex::REF_ID_UNMAPPED)
            {
                // Previous reference ID was for unmapped reads, but the
                // current one is not, so this is not sorted.
                String errorMessage = "ERROR: File is not coordinate sorted at record ";
                errorMessage += myRecordCount;
                errorMessage += "\n\tPrevious record was unmapped, but this record is ";
                errorMessage += header.getReferenceLabel(refID) + ":" + coord;
                myStatus.setStatus(SamStatus::INVALID_SORT, 
                                   errorMessage.c_str());
                status = false;
            }
            else if(refID < myPrevRefID)
            {
                // Current reference id is less than the previous one, 
                //meaning that it is not sorted.
                String errorMessage = "ERROR: File is not coordinate sorted at record ";
                errorMessage += myRecordCount;
                errorMessage += "\n\tPrevious record was ";
                errorMessage += header.getReferenceLabel(myPrevRefID) + ":" + myPrevCoord;
                errorMessage += ", but this record is ";
                errorMessage += header.getReferenceLabel(refID) + ":" + coord;
                myStatus.setStatus(SamStatus::INVALID_SORT, 
                                   errorMessage.c_str());
                status = false;
            }
            else
            {
                // The reference IDs are in the correct order.
                if(refID > myPrevRefID)
                {
                    // New reference id, so set the previous coordinate to -1
                    myPrevCoord = -1;
                }
            
                // Check the coordinates.
                if(coord < myPrevCoord)
                {
                    // New Coord is less than the previous position.
                    String errorMessage = "ERROR: File is not coordinate sorted at record ";
                    errorMessage += myRecordCount;
                    errorMessage += "\n\tPreviousRecord was ";
                    errorMessage += header.getReferenceLabel(myPrevRefID) + ":" + myPrevCoord;
                    errorMessage += ", but this record is ";
                    errorMessage += header.getReferenceLabel(refID) + ":" + coord;
                    myStatus.setStatus(SamStatus::INVALID_SORT, 
                                       errorMessage.c_str());
                    status = false;
                }
                else
                {
                    myPrevRefID = refID;
                    myPrevCoord = coord;
                    status = true;
                }
            }
        }
    }

    return(status);
}
Ejemplo n.º 12
0
bool BamProcessor::processRecord ()
{
    trclog << "\nProcessing record " << read_cnt_ << " - " << rec_.getReadName () << ", " << rec_.get0BasedUnclippedEnd () << "->" << rec_.getReadLength () << ", ref " << rec_.getReferenceName () << std::endl;
    const char* seq = rec_.getSequence ();
    unsigned position = rec_.get0BasedPosition ();
    unsigned new_position = position;
    bool reverse_match = (rec_.getFlag () & 0x10);

    Cigar* cigar_p = rec_.getCigarInfo ();
    if (!cigar_p->size ())  // can not recreate reference is cigar is missing. Keep record unaligned.
    {                       // TODO: allow to specify and load external reference
        ++ unaligned_cnt_;
        return true;
    }

    myassert (cigar_p);

    const String *mdval = rec_.getStringTag ("MD");
    if (!mdval) // can not recreate reference is MD tag is missing. Keep record as is.
    {
        warn << "No MD Tag for record " << proc_cnt_ << ". Skipping record." << std::endl;
        ++nomd_cnt_;
        return true; // record will be kept as-is.
    }
    std::string md_tag = mdval->c_str ();

    // find the non-clipped region
    uint32_t clean_len;
    EndClips clips;
    const char* clean_read = clip_seq (seq, *cigar_p, clean_len, clips);

    // find length needed for the reference
    // this reserves space enough for entire refference, including softclipped ends.
    unsigned ref_len = cigar_p->getExpectedReferenceBaseCount ();
    if (ref_buffer_sz_ < ref_len)
    {
        ref_buffer_sz_ = (1 + ref_len / REF_BUF_INCR) * REF_BUF_INCR;
        ref_buffer_.reset (ref_buffer_sz_);
    }
    if (clean_len > MAX_SEQ_LEN || ref_len > MAX_SEQ_LEN)
    {
        ++ toolongs_;
        return true;
    }

    // recreate reference by Query, Cigar, and MD tag. Do not include softclipped ends in the recreated sequence (use default last parameter)
    recreate_ref (seq, rec_.getReadLength (), cigar_p, md_tag.c_str (), ref_buffer_, ref_buffer_sz_);

    unsigned qry_ins; // extra bases in query     == width_left
    unsigned ref_ins; // extra bases in reference == width_right
    band_width (*cigar_p, qry_ins, ref_ins);

    if (log_matr_ || log_base_)
    {
        logfile_ << "Record " << read_cnt_ << ": " << rec_.getReadName () << "\n"
                 << "   sequence (" << rec_.getReadLength () << " bases)\n";
    }

    CigarRoller roller;
    int ref_shift = 0;  // shift of the new alignment position on refereance relative the original
    unsigned qry_off, ref_off; // offsets on the query and reference of the first non-clipped aligned bases
    double new_score = 0;

    switch (p_->algo ())
    {
        case ContalignParams::TEMPL:
        {
            // call aligner
            new_score = taligner_.eval (clean_read, clean_len, ref_buffer_, ref_len, 0, band_width_);
            // read traceback
            // TODO: convert directly to cigar
            genstr::Alignment* al = taligner_.trace ();
            // convert alignment to cigar
            ref_shift = roll_cigar (roller, *al, clean_len, clips, qry_off, ref_off);
        }
        break;
        case ContalignParams::PLAIN:
        {
            new_score = aligner_.align_band (
                clean_read,                     // xseq
                clean_len,                      // xlen
                ref_buffer_,                    // yseq
                ref_len,                        // ylen
                0,                              // xpos
                0,                              // ypos
                std::max (clean_len, ref_len),  // segment length
                qry_ins + band_width_,          // width_left
                false,                          // unpack
                ref_ins + band_width_,          // width_right - forces to width_left
                true,                           // to_beg
                true                            // to_end
                );
            unsigned bno = aligner_.backtrace (
                    batches_,      // BATCH buffer
                    max_batch_no_, // size of BATCH buffer
                    false,         // fill the BATCH array in reverse direction
                    ref_ins + band_width_ // width
                                    );
            // convert alignment to cigar
            ref_shift = roll_cigar (roller, batches_, bno, clean_len, clips, qry_off, ref_off);
        }
        break;
        case ContalignParams::POLY:
        {
            new_score = contalign_.align_band (
                clean_read,                     // xseq
                clean_len,                      // xlen
                ref_buffer_,                    // yseq
                ref_len,                        // ylen
                0,                              // xpos
                0,                              // ypos
                std::max (clean_len, ref_len),  // segment length
                qry_ins + band_width_,          // width_left
                false,                          // unpack
                ref_ins + band_width_,          // width_right - forces to width_left
                true,                           // to_beg
                true                            // to_end
                );
            unsigned bno = contalign_.backtrace (
                    batches_,      // BATCH buffer
                    max_batch_no_, // size of BATCH buffer
                    false,         // fill the BATCH array in reverse direction
                    ref_ins + band_width_ // width
                                    );
            // convert alignment to cigar
            ref_shift = roll_cigar (roller, batches_, bno, clean_len, clips, qry_off, ref_off);
        }
        break;
        default:
        break;
    }
    ++realigned_cnt_;
    // compare original and new cigar (and location)
    if (ref_shift || !(*cigar_p == roller))
    {
        // save original cigar and position for reporting
        std::string orig_cigar_str;
        rec_.getCigarInfo ()->getCigarString (orig_cigar_str);
        int32_t prior_pos = rec_.get0BasedPosition ();

        // replace cigar
        rec_.setCigar (roller);
        ++ modified_cnt_;
        // update pos_adjusted_cnt if position changed
        if (ref_shift != 0)
        {
            myassert (prior_pos + ref_shift >= 0);
            rec_.set0BasedPosition (prior_pos + ref_shift);
            ++ pos_adjusted_cnt_;
        }
        if (log_diff_)
        {
            const unsigned MAX_BATCH_PRINTED = 100;
            BATCH batches [MAX_BATCH_PRINTED];
            std::string new_cigar_str;
            unsigned bno;
            int swscore;

            rec_.getCigarInfo ()->getCigarString (new_cigar_str);
            if (!log_base_ && !log_matr_)
                logfile_ << "Record " << read_cnt_ << ": " << rec_.getReadName () << " (" << rec_.getReadLength () << " bases)\n";

            logfile_ << "   ORIG ALIGNMENT:" << std::right << std::setw (9) << prior_pos+1 << "->" <<  orig_cigar_str << "\n";
            bno = cigar_to_batches (orig_cigar_str, batches, MAX_BATCH_PRINTED);
            swscore = align_score (batches, bno, clean_read, ref_buffer_, p_->gip (), p_->gep (), p_->mat (), p_->mis ());
            print_batches (clean_read, clean_len, false, ref_buffer_, ref_len, false, batches, bno, logfile_, false, prior_pos + clips.soft_beg_, clips.soft_beg_, 0, 160);
            logfile_ << "\n     'classic' SW score is " << swscore << "\n";

            logfile_ << "   NEW ALIGNMENT:" << std::right << std::setw (9) << rec_.get1BasedPosition () << "->" <<  new_cigar_str << std::endl;
            bno = cigar_to_batches (new_cigar_str, batches, MAX_BATCH_PRINTED);
            swscore = align_score (batches, bno, clean_read + qry_off, ref_buffer_ + ref_off, p_->gip (), p_->gep (), p_->mat (), p_->mis ());
            print_batches (clean_read + qry_off, clean_len - qry_off, false, ref_buffer_ + ref_off, ref_len - ref_off, false, batches, bno, logfile_, false, prior_pos + clips.soft_beg_ + ref_off, clips.soft_beg_ + qry_off, 0, 160);
            logfile_ << "\n      'classic' SW score is " << swscore;
            logfile_ << "\n      alternate (context-aware) score is " << new_score << ", used bandwidth left: " << qry_ins + band_width_ << ", right: " << ref_ins + band_width_ << "\n" << std::endl;
        }
        else if (log_base_)
        {
            logfile_ << "Recomputed alignment differs from original:\n";
            logfile_ << "   ORIG ALIGNMENT:" << std::right << std::setw (9) << prior_pos+1 << "->" <<  orig_cigar_str << "\n";
            std::string new_cigar_str;
            rec_.getCigarInfo ()->getCigarString (new_cigar_str);
            logfile_ << "    NEW ALIGNMENT:" << std::right << std::setw (9) << rec_.get1BasedPosition () << "->" <<  new_cigar_str << "\n" << std::endl;
        }
    }
    else
    {
        if (log_base_)
        {
            logfile_ << "Recomputed alignment matches the original:\n";
            std::string orig_cigar_str;
            rec_.getCigarInfo ()->getCigarString (orig_cigar_str);
            int32_t prior_pos = rec_.get0BasedPosition ();
            logfile_ << "   " << std::right << std::setw (9) << prior_pos+1 << "->" <<  orig_cigar_str << "\n" << std::endl;
        }
    }
    return true;
}
Ejemplo n.º 13
0
void validateRead1ModQuality(SamRecord& samRecord)
{
    //////////////////////////////////////////
    // Validate Record 1
    // Create record structure for validating.
    int expectedBlockSize = 89;
    const char* expectedReferenceName = "1";
    const char* expectedMateReferenceName = "1";
    const char* expectedMateReferenceNameOrEqual = "=";

    bamRecordStruct* expectedRecordPtr =
        (bamRecordStruct *) malloc(expectedBlockSize + sizeof(int));

    char tag[3];
    char type;
    void* value;
    bamRecordStruct* bufferPtr;
    unsigned char* varPtr;

    expectedRecordPtr->myBlockSize = expectedBlockSize;
    expectedRecordPtr->myReferenceID = 0;
    expectedRecordPtr->myPosition = 1010;
    expectedRecordPtr->myReadNameLength = 23;
    expectedRecordPtr->myMapQuality = 0;
    expectedRecordPtr->myBin = 4681;
    expectedRecordPtr->myCigarLength = 2;
    expectedRecordPtr->myFlag = 73;
    expectedRecordPtr->myReadLength = 5;
    expectedRecordPtr->myMateReferenceID = 0;
    expectedRecordPtr->myMatePosition = 1010;
    expectedRecordPtr->myInsertSize = 0;
   
    // Check the alignment end
    assert(samRecord.get0BasedAlignmentEnd() == 1016);
    assert(samRecord.get1BasedAlignmentEnd() == 1017);
    assert(samRecord.getAlignmentLength() == 7);
    assert(samRecord.get0BasedUnclippedStart() == 1010);
    assert(samRecord.get1BasedUnclippedStart() == 1011);
    assert(samRecord.get0BasedUnclippedEnd() == 1016);
    assert(samRecord.get1BasedUnclippedEnd() == 1017);

    // Check the accessors.
    assert(samRecord.getBlockSize() == expectedRecordPtr->myBlockSize);
    assert(samRecord.getReferenceID() == expectedRecordPtr->myReferenceID);
    assert(strcmp(samRecord.getReferenceName(), expectedReferenceName) == 0);
    assert(samRecord.get1BasedPosition() == expectedRecordPtr->myPosition + 1);
    assert(samRecord.get0BasedPosition() == expectedRecordPtr->myPosition);
    assert(samRecord.getReadNameLength() == 
           expectedRecordPtr->myReadNameLength);
    assert(samRecord.getMapQuality() == expectedRecordPtr->myMapQuality);
    assert(samRecord.getBin() == expectedRecordPtr->myBin);
    assert(samRecord.getCigarLength() == expectedRecordPtr->myCigarLength);
    assert(samRecord.getFlag() == expectedRecordPtr->myFlag);
    assert(samRecord.getReadLength() == expectedRecordPtr->myReadLength);
    assert(samRecord.getMateReferenceID() ==
           expectedRecordPtr->myMateReferenceID);
    assert(strcmp(samRecord.getMateReferenceName(), 
                  expectedMateReferenceName) == 0);
    assert(strcmp(samRecord.getMateReferenceNameOrEqual(), 
                  expectedMateReferenceNameOrEqual) == 0);
    assert(samRecord.get1BasedMatePosition() == 
           expectedRecordPtr->myMatePosition + 1);
    assert(samRecord.get0BasedMatePosition() ==
           expectedRecordPtr->myMatePosition);
    assert(samRecord.getInsertSize() == expectedRecordPtr->myInsertSize);
    assert(strcmp(samRecord.getReadName(), "1:1011:F:255+17M15D20M") == 0);
    assert(strcmp(samRecord.getCigar(), "5M2D") == 0);
    assert(strcmp(samRecord.getSequence(), "CCGAA") == 0);
    assert(strcmp(samRecord.getQuality(), "ABCDE") == 0);
    assert(samRecord.getNumOverlaps(1010, 1017) == 5);
    assert(samRecord.getNumOverlaps(1010, 1016) == 5);
    assert(samRecord.getNumOverlaps(1012, 1017) == 3);
    assert(samRecord.getNumOverlaps(1015, 1017) == 0);
    assert(samRecord.getNumOverlaps(1017, 1010) == 0);
    assert(samRecord.getNumOverlaps(1013, 1011) == 0);
    assert(samRecord.getNumOverlaps(-1, 1017) == 5);

    // Reset the tag iter, since the tags have already been read.
    samRecord.resetTagIter();

    // Check the tags.
    assert(samRecord.getNextSamTag(tag, type, &value) == true);
    assert(tag[0] == 'A');
    assert(tag[1] == 'M');
    assert(type == 'i');
    assert(*(char*)value == 0);
    assert(samRecord.getNextSamTag(tag, type, &value) == true);
    assert(tag[0] == 'M');
    assert(tag[1] == 'D');
    assert(type == 'Z');
    assert(*(String*)value == "37");
    assert(samRecord.getNextSamTag(tag, type, &value) == true);
    assert(tag[0] == 'N');
    assert(tag[1] == 'M');
    assert(type == 'i');
    assert(*(char*)value == 0);
    assert(samRecord.getNextSamTag(tag, type, &value) == true);
    assert(tag[0] == 'X');
    assert(tag[1] == 'T');
    assert(type == 'A');
    assert(*(char*)value == 'R');
    // No more tags, should return false.
    assert(samRecord.getNextSamTag(tag, type, &value) == false);
    assert(samRecord.getNextSamTag(tag, type, &value) == false);

    // Get the record ptr.   
    bufferPtr = (bamRecordStruct*)samRecord.getRecordBuffer();
    // Validate the buffers match.
    assert(bufferPtr->myBlockSize == expectedRecordPtr->myBlockSize);
    assert(bufferPtr->myReferenceID == expectedRecordPtr->myReferenceID);
    assert(bufferPtr->myPosition == expectedRecordPtr->myPosition);
    assert(bufferPtr->myReadNameLength == expectedRecordPtr->myReadNameLength);
    assert(bufferPtr->myMapQuality == expectedRecordPtr->myMapQuality);
    assert(bufferPtr->myBin == expectedRecordPtr->myBin);
    assert(bufferPtr->myCigarLength == expectedRecordPtr->myCigarLength);
    assert(bufferPtr->myFlag == expectedRecordPtr->myFlag);
    assert(bufferPtr->myReadLength == expectedRecordPtr->myReadLength);
    assert(bufferPtr->myMateReferenceID ==
           expectedRecordPtr->myMateReferenceID);
    assert(bufferPtr->myMatePosition == expectedRecordPtr->myMatePosition);
    assert(bufferPtr->myInsertSize == expectedRecordPtr->myInsertSize);

    // Validate the variable length fields in the buffer.
    // Set the pointer to the start of the variable fields.
    varPtr = (unsigned char*)(&(bufferPtr->myData[0]));

    // Validate the readname.
    for(int i = 0; i < expectedRecordPtr->myReadNameLength; i++)
    {
        assert(*varPtr == samRecord.getReadName()[i]);
        varPtr++;
    }

    // Validate the cigar.
    // The First cigar is 5M which is 5 << 4 | 0 = 80
    assert(*(unsigned int*)varPtr == 80);
    // Increment the varptr the size of an int.
    varPtr += 4;
    // The 2nd cigar is 2D which is 2 << 4 | 2 = 34
    assert(*(unsigned int*)varPtr == 34);
    // Increment the varptr the size of an int.
    varPtr += 4;
   
    // Validate the sequence.
    // CC = 0x22
    assert(*varPtr == 0x22);
    varPtr++;
    // GA = 0x41
    assert(*varPtr == 0x41);
    varPtr++;
    // A  = 0x10
    assert(*varPtr == 0x10);
    varPtr++;
  
    // Validate the Quality
    for(int i = 0; i < expectedRecordPtr->myReadLength; i++)
    {
        assert(*varPtr == samRecord.getQuality()[i] - 33);
        varPtr++;
    }

    // Validate the tags.  
    assert(*varPtr == 'A');
    varPtr++;
    assert(*varPtr == 'M');
    varPtr++;
    assert(*varPtr == 'C');
    varPtr++;
    assert(*varPtr == 0);
    varPtr++;
    assert(*varPtr == 'M');
    varPtr++;
    assert(*varPtr == 'D');
    varPtr++;
    assert(*varPtr == 'Z');
    varPtr++;
    assert(*varPtr == '3');
    varPtr++;
    assert(*varPtr == '7');
    varPtr++;
    assert(*varPtr == 0);
    varPtr++;
    assert(*varPtr == 'N');
    varPtr++;
    assert(*varPtr == 'M');
    varPtr++;
    assert(*varPtr == 'C');
    varPtr++;
    assert(*varPtr == 0);
    varPtr++;
    assert(*varPtr == 'X');
    varPtr++;
    assert(*varPtr == 'T');
    varPtr++;
    assert(*varPtr == 'A');
    varPtr++;
    assert(*varPtr == 'R');
    varPtr++;
}
Ejemplo n.º 14
0
// When a record is read, check if it is a duplicate or
// store for future checking.
void Dedup::checkDups(SamRecord& record, uint32_t recordCount)
{
    // Only inside this method if the record is mapped.

    // Get the key for this record.
    static DupKey key;
    static DupKey mateKey;
    key.updateKey(record, getLibraryID(record));

    int flag = record.getFlag(); 
    bool recordPaired = SamFlag::isPaired(flag) && SamFlag::isMateMapped(flag);
    int sumBaseQual = getBaseQuality(record);

    int32_t chromID = record.getReferenceID();
    int32_t mateChromID = record.getMateReferenceID();

    // If we are one-chrom and the mate is not on the same chromosome, 
    // mark it as not paired.
    if(myOneChrom && (chromID != mateChromID))
    {
        recordPaired = false;
    }
    
    // Look in the map to see if an entry for this key exists.
    FragmentMapInsertReturn ireturn = 
        myFragmentMap.insert(std::make_pair(key, ReadData()));

    ReadData* readData = &(ireturn.first->second);

    // Mark this record's data in the fragment record if this is the first
    // entry or if it is a duplicate and the old record is not paired and 
    // the new record is paired or the has a higher quality.
    if((ireturn.second == true) ||
       ((readData->paired == false) && 
        (recordPaired || (sumBaseQual > readData->sumBaseQual))))
    {
        // If there was a previous record, mark it duplicate and release
        // the old record
        if(ireturn.second == false)
        {
            // Mark the old record as a DUPLICATE!
            handleDuplicate(readData->recordIndex, readData->recordPtr);
        }
        // Store this record for later duplicate checking.
        readData->sumBaseQual = sumBaseQual;
        readData->recordIndex = recordCount;
        readData->paired = recordPaired;
        if(recordPaired)
        {
            readData->recordPtr = NULL;
        }
        else
        {
            readData->recordPtr = &record;
        }
    }
    else
    {
        // The old record is not a duplicate so the new record is
        // a duplicate if it is not paired.
        if(recordPaired == false)
        {
            // This record is a duplicate, so mark it and release it.
            handleDuplicate(recordCount, &record);
        }
    }

    // Only paired processing is left, so return if not paired.
    if(recordPaired == false)
    {
        // Not paired, no more operations required, so return.
        return;
    }
    
    // This is a paired record, so check for its mate.
    uint64_t readPos = 
        SamHelper::combineChromPos(chromID,
                                   record.get0BasedPosition());
    uint64_t matePos =
        SamHelper::combineChromPos(mateChromID, 
                                   record.get0BasedMatePosition());
    SamRecord* mateRecord = NULL;
    int mateIndex = 0;
    
    // Check to see if the mate is prior to this record.
    if(matePos <= readPos)
    {
        // The mate map is stored by the mate position, so look for this 
        // record's position.
        // The mate should be in the mate map, so find it.
        std::pair<MateMap::iterator,MateMap::iterator> matches =
            myMateMap.equal_range(readPos);
        // Loop through the elements that matched the pos looking for the mate.
        for(MateMap::iterator iter = matches.first; 
            iter != matches.second; iter++)
        {
            if(strcmp((*iter).second.recordPtr->getReadName(), 
                      record.getReadName()) == 0)
            {
                // Found the match.
                ReadData* mateData = &((*iter).second);
                // Update the quality and track the mate record and index.
                sumBaseQual += mateData->sumBaseQual;
                mateIndex = mateData->recordIndex;
                mateRecord = mateData->recordPtr;
                // Remove the entry from the map.
                myMateMap.erase(iter);
                break;
            }
        }
    }
    if((mateRecord == NULL) && (matePos >= readPos))
    {
        // Haven't gotten to the mate yet, so store this record.
        MateMap::iterator mateIter = 
            myMateMap.insert(std::make_pair(matePos, ReadData()));
        mateIter->second.sumBaseQual = sumBaseQual;
        mateIter->second.recordPtr = &record;
        mateIter->second.recordIndex = recordCount;
        // No more processing for this record is necessary.
        return;
    }

    if(mateRecord == NULL)
    {
        // Passed the mate, but it was not found.
        handleMissingMate(&record);
        return;
    }

    // Make the paired key.
    mateKey.updateKey(*mateRecord, getLibraryID(*mateRecord));
    PairedKey pkey(key, mateKey);

    // Check to see if this pair is a duplicate.
    PairedMapInsertReturn pairedReturn = 
        myPairedMap.insert(std::make_pair(pkey,PairedData()));
    PairedData* storedPair = &(pairedReturn.first->second);

    // Get the index for "record 1" - the one with the earlier coordinate.
    int record1Index = getFirstIndex(key, recordCount,
                                          mateKey, mateIndex);

    // Check if we have already found a duplicate pair.
    // If there is no duplicate found, there is nothing more to do.
    if(pairedReturn.second == false)
    {
        // Duplicate found.
        bool keepStored = true;
        if(pairedReturn.first->second.sumBaseQual < sumBaseQual)
        {
            // The new pair has higher quality, so keep that.
            keepStored = false;
        }
        else if(pairedReturn.first->second.sumBaseQual == sumBaseQual)
        {
            // Same quality, so keep the one with the earlier record1Index.
            if(record1Index < storedPair->record1Index)
            {
                // The new pair has an earlier lower coordinate read,
                // so keep that.
                keepStored = false;
            }
        }
        // Check to see which one should be kept by checking qualities.
        if(keepStored)
        {
            // The old pair had higher quality so mark the new pair as a
            // duplicate and release them.
            handleDuplicate(mateIndex, mateRecord);
            handleDuplicate(recordCount, &record);
        }
        else
        {
            // The new pair has higher quality, so keep that.
            // First mark the previous one as duplicates and release them.
            handleDuplicate(storedPair->record1Index, storedPair->record1Ptr);
            handleDuplicate(storedPair->record2Index, storedPair->record2Ptr);
            // Store this pair's information.
            if(record1Index == mateIndex)
            {
                // Mate has a lower coordinate, so make mate
                // record1.
                storedPair->sumBaseQual = sumBaseQual;
                storedPair->record1Ptr = mateRecord;
                storedPair->record2Ptr = &record;
                storedPair->record1Index = mateIndex;
                storedPair->record2Index = recordCount;
            }
            else
            {
                // This record has a lower coordinate, so make it
                // record1.
                storedPair->sumBaseQual = sumBaseQual;
                storedPair->record1Ptr = &record;
                storedPair->record2Ptr = mateRecord;
                storedPair->record1Index = recordCount;
                storedPair->record2Index = mateIndex;
            }
        }
    }
    else
    {
        // Store this pair's information.
        storedPair->sumBaseQual = sumBaseQual;

        if(record1Index == mateIndex)
        {
            // Mate has a lower coordinate, so make mate
            // record1.
            storedPair->record1Ptr = mateRecord;
            storedPair->record2Ptr = &record;
            storedPair->record1Index = mateIndex;
            storedPair->record2Index = recordCount;
        }
        else
        {
            // This record has a lower coordinate, so make it
            // record1.
            storedPair->record1Ptr = &record;
            storedPair->record2Ptr = mateRecord;
            storedPair->record1Index = recordCount;
            storedPair->record2Index = mateIndex;
        }
    }
}
Ejemplo n.º 15
0
int WriteRegion::execute(int argc, char **argv)
{
    // Extract command line arguments.
    String inFile = "";
    String outFile = "";
    String indexFile = "";
    String readName = "";
    String bed = "";
    myStart = UNSPECIFIED_INT;
    myEnd = UNSPECIFIED_INT;
    myPrevStart = UNSPECIFIED_INT;
    myPrevEnd = UNSPECIFIED_INT;
    myRefID = UNSET_REF;
    myRefName.Clear();
    myPrevRefName.Clear();
    myBedRefID = SamReferenceInfo::NO_REF_ID;
    bool lshift = false;
    bool noeof = false;
    bool params = false;
    myWithinReg = false;
    myWroteReg = false;

    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in", &inFile)
        LONG_STRINGPARAMETER("out", &outFile)
        LONG_PARAMETER_GROUP("Optional Region Parameters")        
        LONG_STRINGPARAMETER("bamIndex", &indexFile)
        LONG_STRINGPARAMETER("refName", &myRefName)
        LONG_INTPARAMETER("refID", &myRefID)
        LONG_INTPARAMETER("start", &myStart)
        LONG_INTPARAMETER("end", &myEnd)
        LONG_STRINGPARAMETER("bed", &bed)
        LONG_PARAMETER("withinReg", &myWithinReg)
        LONG_STRINGPARAMETER("readName", &readName)
        LONG_PARAMETER_GROUP("Optional Other Parameters")
        LONG_PARAMETER("lshift", &lshift)
        LONG_PARAMETER("noeof", &noeof)
        LONG_PARAMETER("params", &params)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));

    inputParameters.Read(argc-1, &(argv[1]));

    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    // Check to see if the in file was specified, if not, report an error.
    if(inFile == "")
    {
        usage();
        // mandatory argument was not specified.
        inputParameters.Status();
        std::cerr << "Missing mandatory argument: --in" << std::endl;
        return(-1);
    }
    if(outFile == "")
    {
        usage();
        // mandatory argument was not specified.
        inputParameters.Status();
        std::cerr << "Missing mandatory argument: --out" << std::endl;
        return(-1);
    }
    
    if(indexFile == "")
    {
        // In file was not specified, so set it to the in file
        // + ".bai"
        indexFile = inFile + ".bai";
    }

    if(myRefID != UNSET_REF && myRefName.Length() != 0)
    {
        std::cerr << "Can't specify both refID and refName" << std::endl;
        inputParameters.Status();
        return(-1);
    }
    if(myRefID != UNSET_REF && bed.Length() != 0)
    {
        std::cerr << "Can't specify both refID and bed" << std::endl;
        inputParameters.Status();
        return(-1);
    }
    if(myRefName.Length() != 0 && bed.Length() != 0)
    {
        std::cerr << "Can't specify both refName and bed" << std::endl;
        inputParameters.Status();
        return(-1);
    }

    if(!bed.IsEmpty())
    {
        myBedFile = ifopen(bed, "r");
    }

    if(params)
    {
        inputParameters.Status();
    }

    // Open the file for reading.   
    mySamIn.OpenForRead(inFile);

    // Open the output file for writing.
    SamFile samOut;
    samOut.OpenForWrite(outFile);

    // Open the bam index file for reading if a region was specified.
    if((myRefName.Length() != 0) || (myRefID != UNSET_REF) || (myBedFile != NULL))
    {
        mySamIn.ReadBamIndex(indexFile);
    }

    // Read & write the sam header.
    mySamIn.ReadHeader(mySamHeader);
    samOut.WriteHeader(mySamHeader);

    // Read the sam records.
    SamRecord samRecord;
    // Track the status.
    int numSectionRecords = 0;

    // Set returnStatus to success.  It will be changed
    // to the failure reason if any of the writes fail.
    SamStatus::Status returnStatus = SamStatus::SUCCESS;
        
    while(getNextSection())
    {
        // Keep reading records until they aren't anymore.
        while(mySamIn.ReadRecord(mySamHeader, samRecord))
        {
            if(!readName.IsEmpty())
            {
                // Check for readname.
                if(strcmp(samRecord.getReadName(), readName.c_str()) != 0)
                {
                    // not a matching read name, so continue to the next record.
                    continue;
                }
            }
            
            // Check to see if the read has already been processed.
            if(myPrevEnd != UNSPECIFIED_INT)
            {
                // Because we already know that the bed was sorted, 
                // we know that the previous section started before
                // this one, so if the previous end is greater than
                // this record's end position we know that it
                // was already written in the previous section.
                // Note: can't be equal to the previous end since
                // the end range was exclusive, while
                // get0BasedAlignmentEnd is inclusive.
                // myPrevEnd is reset by getNextSection when a new
                // chromosome is hit.
                if(samRecord.get0BasedAlignmentEnd() < myPrevEnd)
                {
                    // This record was already written.
                    continue;
                }
            }

            // Shift left if applicable.
            if(lshift)
            {
                samRecord.shiftIndelsLeft();
            }

            // Successfully read a record from the file, so write it.
            samOut.WriteRecord(mySamHeader, samRecord);
            ++numSectionRecords;
        }
        myWroteReg = true;
    }

    if(myBedFile != NULL)
    {
        ifclose(myBedFile);
    }
    std::cerr << "Wrote " << outFile << " with " << numSectionRecords
              << " records.\n";
    return(returnStatus);
}