Ejemplo n.º 1
0
void Bam2FastQ::handlePairedRN(SamRecord& samRec)
{
    static SamRecord* prevRec = NULL;
    static std::string prevRN = "";

    if(prevRec == NULL)
    {
        prevRec = &samRec;
    }
    else
    {
        if(strcmp(prevRec->getReadName(), samRec.getReadName()) != 0)
        {
            // Read Name does not match, error, did not find pair.
            std::cerr << "Paired Read, " << prevRec->getReadName()
                      << " but couldn't find mate, so writing as "
                      << "unpaired (single-ended)\n";
            ++myNumMateFailures;
            writeFastQ(*prevRec, myUnpairedFile,
                       myUnpairedFileNameExt);
            // Save this record to check against the next one.
            prevRec = &samRec;
        }
        else
        {
            // Matching ReadNames.
            // Found the mate.
            ++myNumPairs;
            // Check which is the first in the pair.
            if(SamFlag::isFirstFragment(samRec.getFlag()))
            {
                if(SamFlag::isFirstFragment(prevRec->getFlag()))
                {
                    std::cerr << "Both reads of " << samRec.getReadName()
                              << " are first fragment, so "
                              << "splitting one to be in the 2nd fastq.\n";
                }
                writeFastQ(samRec, myFirstFile, myFirstFileNameExt,
                           myFirstRNExt.c_str());
                writeFastQ(*prevRec, mySecondFile, mySecondFileNameExt,
                           mySecondRNExt.c_str());
            }
            else
            {
                if(!SamFlag::isFirstFragment(prevRec->getFlag()))
                {
                    std::cerr << "Neither read of " << samRec.getReadName()
                              << " are first fragment, so "
                              << "splitting one to be in the 2nd fastq.\n";
                }
                writeFastQ(*prevRec, myFirstFile, myFirstFileNameExt, myFirstRNExt.c_str());
                writeFastQ(samRec, mySecondFile, mySecondFileNameExt, mySecondRNExt.c_str());
            }
            // No previous record.
            prevRec = NULL;
        }
    }
}
Ejemplo n.º 2
0
int Likelihood::getFlankingCount( string & chr, int pos )
{
	bool status = bam.SetReadSection( chr.c_str(), pos - avr_read_length * 2, pos + avr_read_length/2 );
	if (!status)
		return 0;

	int n = 0;
	SamRecord rec;
	while(bam.ReadRecord(bam_header, rec)) {
		if ( !(rec.getFlag() & 0x2) )
			continue;
		if (IsSupplementary(rec.getFlag()))
			continue;
		if (rec.getReadLength() < avr_read_length / 3)
			continue;
		if (rec.getMapQuality() < MIN_QUALITY)
			continue;
		if (rec.get1BasedPosition() + MIN_CLIP/2 <= pos && rec.get1BasedAlignmentEnd() - MIN_CLIP/2 >= pos )
			n++;
	}

	return n;
}
Ejemplo n.º 3
0
void SamFilter::filterRead(SamRecord& record)
{
    // Filter the read by marking it as unmapped.
    uint16_t flag = record.getFlag(); 
    SamFlag::setUnmapped(flag);
    // Clear N/A flags.
    flag &= ~SamFlag::PROPER_PAIR;
    flag &= ~SamFlag::SECONDARY_ALIGNMENT;
    flag &= ~SamFlag::SUPPLEMENTARY_ALIGNMENT;
    record.setFlag(flag);
    // Clear Cigar
    record.setCigar("*");
    // Clear mapping quality
    record.setMapQuality(0);
}
Ejemplo n.º 4
0
void Bam2FastQ::writeFastQ(SamRecord& samRec, IFILE filePtr,
                             const char* readNameExt)
{
    static int16_t flag;
    static std::string sequence;
    static String quality;

    if(filePtr == NULL)
    {
        return;
    }

    flag = samRec.getFlag();
    const char* readName = samRec.getReadName();
    sequence = samRec.getSequence();
    quality = samRec.getQuality();
    
    if(SamFlag::isReverse(flag) && myReverseComp)
    {
        // It is reverse, so reverse compliment the sequence
        BaseUtilities::reverseComplement(sequence);
        // Reverse the quality.
        quality.Reverse();
    }
    else
    {
        // Ensure it is all capitalized.
        int seqLen = sequence.size();
        for (int i = 0; i < seqLen; i++)
        {
            sequence[i] = (char)toupper(sequence[i]);
        }
    }
    
    if(myRNPlus)
    {

        ifprintf(filePtr, "@%s%s\n%s\n+%s%s\n%s\n", readName, readNameExt,
                 sequence.c_str(), readName, readNameExt, quality.c_str());
    }
    else
    {
        ifprintf(filePtr, "@%s%s\n%s\n+\n%s\n", readName, readNameExt,
                 sequence.c_str(), quality.c_str());
    }
    // Release the record.
    myPool.releaseRecord(&samRec);
}
Ejemplo n.º 5
0
bool Demux::verify(){ //at least one decoy should be present
	BamReader bamReader(bamFilePath);
	SamRecord samRecord;
	int verifiedDecoys = 0;
	while (	bamReader.getNextRecord(samRecord)) {
		string recordName(samRecord.getReadName());
		recordName = cHandler.decrypt(recordName);
		//clean record name
		int len=recordName.find("$");
		recordName=recordName.substr(0,len);
		//clean ended
		if (isDecoy(recordName)){
			char decoyChromosome[2][20];
			int decoyLocation[2];
			int decoyJobID;

			sscanf(	recordName.c_str(),"DECOY.%[^.].%d_%[^.].%d.%d",
					decoyChromosome[0], decoyLocation, 
					decoyChromosome[1], decoyLocation + 1,  &decoyJobID);

			int pair = 	!SamFlag::isFirstFragment(samRecord.getFlag());
			if (strcmp(decoyChromosome[pair],samRecord.getReferenceName())){
				cout << "Chromosome mismatch\n";
				cout << "Expected: " << decoyChromosome[pair] << endl;
				cout << "Got: " << samRecord.getReferenceName() << endl;
				return false;
			}
			if ( decoyLocation[pair] != samRecord.get0BasedPosition()){
				cout << "Position mismatch\n";
				cout << "Expected: " << decoyLocation[pair] << endl;
				cout << "Got: " << samRecord.get0BasedPosition() << endl;
				return false;
			}
			if (decoyJobID != jobID){
				cout << "Job ID mismatch\n";
				cout << "Expected: " << decoyJobID << endl;
				cout << "Got: " << jobID << endl;
				return false;
			}
			verifiedDecoys++;
		}
    }
    if (verifiedDecoys)
		return true;
	else
		return false;
}
Ejemplo n.º 6
0
void GenomeRegionSeqStats::CalcClusters(String &bamFile, int minMapQuality)
{
  SamFile sam;
  SamRecord samRecord;
  SamFileHeader samHeader;

  if(!sam.OpenForRead(bamFile.c_str()))
    error("Open BAM file %s failed!\n", bamFile.c_str());

  if(!sam.ReadHeader(samHeader)) {
      error("Read BAM file header %s failed!\n", bamFile.c_str());
  }
  
  if(depth.size()==0) depth.resize(referencegenome.sequenceLength());
  
  String contigLabel;
  uint32_t start;
  uint32_t gstart;
  Reset();
  while(sam.ReadRecord(samHeader, samRecord))
    {
      nReads++;
      if(samRecord.getFlag() & SamFlag::UNMAPPED) nUnMapped++;

      if(samRecord.getMapQuality() < minMapQuality) continue;

      CigarRoller cigar(samRecord.getCigar());

      int nonClipSequence = 0;

      if(cigar.size()!=0 && cigar[0].operation==Cigar::softClip)
          nonClipSequence = cigar[0].count;

      contigLabel = samRecord.getReferenceName();
      start = nonClipSequence + samRecord.get0BasedPosition();  // start is 0-based

      gstart = referencegenome.getGenomePosition(contigLabel.c_str(), start);

      if(IsInRegions(contigLabel, start, start+samRecord.getReadLength())) continue;

      for(uint32_t i=gstart; i<gstart+samRecord.getReadLength(); i++)
       if(depth[i]<MAXDP)
        depth[i]++;
      nMappedOutTargets++;
    }
}
Ejemplo n.º 7
0
void Sites::setNewCluster( vector<bool> & is_in_coord, SingleSite & new_site, SamRecord & rec )
{
	if (is_in_coord.size() != NMEI)
		morphError("[Sites::setNewCluster] is_in_coord size error");

	// set info
	new_site.breakp = getEstimatedBreakPoint(rec);
	new_site.rcount = 1;
	new_site.evidence = 1;
	for(int m=0; m<NMEI; m++) {
		new_site.left[m] = 0;
		new_site.right[m] = 0;
	}
	new_site.left_clip_only = 1;
	new_site.right_clip_only = 1;
	new_site.depth = current_depth;
	new_site.depth_add = 1;

	// set position & mtype
	if ( rec.getFlag() & 0x10 )  { // right anchor
		new_site.start = rec.get1BasedPosition();
		new_site.end = rec.get1BasedAlignmentEnd();
		Cigar * myCigar = rec.getCigarInfo();
		int begin_clip = myCigar->getNumBeginClips();
		if ( begin_clip < MIN_CLIP/2)
			new_site.right_clip_only = 0;
		for(int m=0; m<NMEI; m++) {
			if (is_in_coord[m])
				new_site.right[m] = 1;
		}
	}
	else {
		new_site.start = rec.get1BasedPosition();
		new_site.end = rec.get1BasedAlignmentEnd();
		Cigar * myCigar = rec.getCigarInfo();
		int end_clip = myCigar->getNumEndClips();
		if (end_clip < MIN_CLIP/2)
			new_site.left_clip_only = 0;
		for(int m=0; m<NMEI; m++) {
			if (is_in_coord[m])
				new_site.left[m] = 1;
		}
	}	
}
Ejemplo n.º 8
0
void Sites::addToCurrentCluster( vector<bool> & is_in_coord, SingleSite & new_site, SamRecord & rec )
{
	if (is_in_coord.size() != NMEI)
		morphError("[Sites::setNewCluster] is_in_coord size error");

	// update breakpoint
	int old_evi = new_site.evidence;
	float a1 = (float)1 / float(old_evi+1);
	int ep = getEstimatedBreakPoint(rec);
	new_site.breakp = round( a1 * (float)ep + (float)new_site.breakp * (1-a1));
	new_site.evidence++; 

	// update position
	if (rec.get1BasedPosition() < new_site.start)
		new_site.start = rec.get1BasedPosition();
	else if (rec.get1BasedAlignmentEnd() > new_site.end)
		new_site.end = rec.get1BasedAlignmentEnd();

	// update info
	if (rec.getFlag() & 0x10) {
		if (new_site.right_clip_only) {
			Cigar * myCigar = rec.getCigarInfo();
			int begin_clip = myCigar->getNumBeginClips();
			if ( begin_clip < MIN_CLIP/2)
				new_site.right_clip_only = 0;	
		}
		for(int m=0; m<NMEI; m++) {
			if (is_in_coord[m])
				new_site.right[m]++;
		}	
	}
	else {
		if (new_site.left_clip_only) {
			Cigar * myCigar = rec.getCigarInfo();
			int end_clip = myCigar->getNumEndClips();
			if (end_clip < MIN_CLIP/2)
				new_site.left_clip_only = 0;			
		}
		for( int m=0; m<NMEI; m++) {
			if (is_in_coord[m])
				new_site.left[m]++;
		}
	}
}
Ejemplo n.º 9
0
// given mapping start position
// if anchor on left
//		expected_breakp = position + avr_ins_size / 2;
// if anchor on right
//		expected_breakp = position + read_length - avr_ins_size / 2;
// key = round( expected_breakp / WIN )
int Sites::getEstimatedBreakPoint( SamRecord & rec )
{
	int ep;
	int clen = GetMaxClipLen(rec);

	if ( !rec.getFlag() & 0x10 ) { // left anchor
		if (clen < -MIN_CLIP/2) // end clip of anchor decides position
			ep = rec.get1BasedAlignmentEnd();
		else
			ep = rec.get1BasedPosition() + avr_ins_size / 3;
	}
	else { // right anchor
		if (clen > MIN_CLIP/2)
			ep = rec.get1BasedPosition();
		else
			ep = rec.get1BasedPosition() + avr_read_length - avr_ins_size / 3;
	}
	return ep;
}
Ejemplo n.º 10
0
void GenomeRegionSeqStats::CalcRegionStats(String &bamFile)
{
  SamFile sam;
  SamRecord samRecord;
  SamFileHeader samHeader;

  if(!sam.OpenForRead(bamFile.c_str()))
    error("Open BAM file %s failed!\n", bamFile.c_str());

  if(!sam.ReadHeader(samHeader)) {
      error("Read BAM file header %s failed!\n", bamFile.c_str());
  }
  
  String contigLabel;
  int start, end;
  Reset();
  while(sam.ReadRecord(samHeader, samRecord))
    {
      nReads++;
      if(samRecord.getFlag() & SamFlag::UNMAPPED) nUnMapped++;

      if(contigFinishedCnt>=contigs.size()) continue;

      CigarRoller cigar(samRecord.getCigar());

      int nonClipSequence = 0;

      if(cigar.size()!=0 && cigar[0].operation==Cigar::softClip)
          nonClipSequence = cigar[0].count;

      contigLabel = samRecord.getReferenceName();
      start = nonClipSequence + samRecord.get0BasedPosition();  // start is 0-based
      end = start + samRecord.getReadLength() - 1;
      if(UpdateRegionStats(contigLabel, start, end)) nMapped2Targets++;
    }
    CalcRegionReadCountInGCBins();
    CalcGroupReadCountInGCBins();
    std::cout << "Total reads : " << nReads << std::endl;
}
Ejemplo n.º 11
0
// make a 64-bit genomic coordinate [24bit-chr][32bit-pos][8bit-orientation]
uint64_t getGenomicCoordinate(SamRecord& r) {
  // 64bit string consisting of
  // 24bit refID, 32bit pos, 8bit orientation
  if ( ( r.getReferenceID() < 0 ) || ( r.get0BasedPosition() < 0 ) ) {
    return UNMAPPED_GENOMIC_COORDINATE;
  }
  else {
    return ( ( static_cast<uint64_t>(r.getReferenceID()) << 40 ) | ( static_cast<uint64_t>(r.get0BasedPosition()) << 8 ) | static_cast<uint64_t>( r.getFlag() & 0x0010 ) );
  }
}
Ejemplo n.º 12
0
int Bam2FastQ::execute(int argc, char **argv)
{
    // Extract command line arguments.
    String inFile = "";
    bool readName = false;
    String refFile = "";
    String firstOut = "";
    String secondOut = "";
    String unpairedOut = "";

    bool interleave = false;
    bool noeof = false;
    bool gzip = false;
    bool params = false;

    myOutBase = "";
    myNumMateFailures = 0;
    myNumPairs = 0;
    myNumUnpaired = 0;
    mySplitRG = false;
    myQField = "";
    myNumQualTagErrors = 0;
    myReverseComp = true;
    myRNPlus = false;
    myFirstRNExt = DEFAULT_FIRST_EXT;
    mySecondRNExt = DEFAULT_SECOND_EXT;
    myCompression = InputFile::DEFAULT;

    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in", &inFile)
        LONG_PARAMETER_GROUP("Optional Parameters")
        LONG_PARAMETER("readName", &readName)
        LONG_PARAMETER("splitRG", &mySplitRG)
        LONG_STRINGPARAMETER("qualField", &myQField)
        LONG_PARAMETER("merge", &interleave)
        LONG_STRINGPARAMETER("refFile", &refFile)
        LONG_STRINGPARAMETER("firstRNExt", &myFirstRNExt)
        LONG_STRINGPARAMETER("secondRNExt", &mySecondRNExt)
        LONG_PARAMETER("rnPlus", &myRNPlus)
        LONG_PARAMETER("noReverseComp", &myReverseComp)
        LONG_PARAMETER("gzip", &gzip)
        LONG_PARAMETER("noeof", &noeof)
        LONG_PARAMETER("params", &params)
        LONG_PARAMETER_GROUP("Optional OutputFile Names")
        LONG_STRINGPARAMETER("outBase", &myOutBase)
        LONG_STRINGPARAMETER("firstOut", &firstOut)
        LONG_STRINGPARAMETER("secondOut", &secondOut)
        LONG_STRINGPARAMETER("unpairedOut", &unpairedOut)
        LONG_PHONEHOME(VERSION)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));

    // parameters start at index 2 rather than 1.
    inputParameters.Read(argc, argv, 2);

    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    if(gzip)
    {
        myCompression = InputFile::GZIP;
    }

    // Check to see if the in file was specified, if not, report an error.
    if(inFile == "")
    {
        usage();
        inputParameters.Status();
        // In file was not specified but it is mandatory.
        std::cerr << "--in is a mandatory argument, "
                  << "but was not specified" << std::endl;
        return(-1);
    }

    // Cannot specify both interleaved & secondOut since secondOut would be N/A.
    if(interleave && !secondOut.IsEmpty())
    {
        usage();
        inputParameters.Status();
        std::cerr << "ERROR: Cannot specify --merge & --secondOut.\n";
        return(-1);
    }

    // Cannot specify both interleaved & secondOut since secondOut would be N/A.
    if(interleave && !secondOut.IsEmpty())
    {
        usage();
        inputParameters.Status();
        std::cerr << "ERROR: Cannot specify --merge & --secondOut.\n";
        return(-1);
    }

    // Cannot specify both splitRG & firstOut/secondOut/unpairedOut
    // since it needs a different file for each RG.
    if(mySplitRG && (!firstOut.IsEmpty() || 
                   !secondOut.IsEmpty() || !unpairedOut.IsEmpty()))
    {
        usage();
        inputParameters.Status();
        std::cerr << "ERROR: Cannot specify --splitRG & --firstOut/--secondOut/--unpairedOut.\n";
        std::cerr << "Use --outBase instead.\n";
        return(-1);
    }
    // Cannot specify splitRG & output to stdout.
    if(mySplitRG && (myOutBase[0] == '-'))
    {
        usage();
        inputParameters.Status();
        std::cerr << "ERROR: Cannot specify --splitRG & write to stdout.\n";
        return(-1);
    }

    // Check to see if the out file was specified, if not, generate it from
    // the input filename.
    if(myOutBase == "")
    {
        // Just remove the extension from the input filename.
        int extStart = inFile.FastFindLastChar('.');
        if(extStart <= 0)
        {
            myOutBase = inFile;
        }
        else
        {
            myOutBase = inFile.Left(extStart);
        }
    }

    if(mySplitRG)
    {
        std::string fqList = myOutBase.c_str();
        fqList += ".list";
        myFqList = ifopen(fqList.c_str(), "w");
        ifprintf(myFqList, "MERGE_NAME\tFASTQ1\tFASTQ2\tRG\n");
    }

    // Check to see if the first/second/single-ended were specified and
    // if not, set them.
    myFirstFileNameExt = "_1.fastq";
    mySecondFileNameExt = "_2.fastq";
    myUnpairedFileNameExt = ".fastq";
    if(interleave)
    {
        myFirstFileNameExt = "_interleaved.fastq";
        myFirstFileNameExt = "_interleaved.fastq";
    }
    getFileName(firstOut, myFirstFileNameExt);
    getFileName(secondOut, mySecondFileNameExt);
    getFileName(unpairedOut, myUnpairedFileNameExt);

    if(params)
    {
        inputParameters.Status();
    }

    // Open the files for reading/writing.
    // Open prior to opening the output files,
    // so if there is an error, the outputs don't get created.
    SamFile samIn;
    samIn.OpenForRead(inFile, &mySamHeader);
    // Skip non-primary reads.
    samIn.SetReadFlags(0, 0x0100);

    // Open the output files if not splitting RG
    if(!mySplitRG)
    {
        myUnpairedFile = ifopen(unpairedOut, "w", myCompression);

        // Only open the first file if it is different than an already opened file.
        if(firstOut != unpairedOut)
        {
            myFirstFile = ifopen(firstOut, "w", myCompression);
        }
        else
        {
            myFirstFile = myUnpairedFile;
        }

        // If it is interleaved or the 2nd file is not a new name, set it appropriately.
        if(interleave || secondOut == firstOut)
        {
            mySecondFile = myFirstFile;
        }
        else if(secondOut == unpairedOut)
        {
            mySecondFile = myUnpairedFile;
        }
        else
        {
            mySecondFile = ifopen(secondOut, "w", myCompression);
        }
    
        if(myUnpairedFile == NULL)
        {
            std::cerr << "Failed to open " << unpairedOut
                      << " so can't convert bam2FastQ.\n";
            return(-1);
        }
        if(myFirstFile == NULL)
        {
            std::cerr << "Failed to open " << firstOut
                      << " so can't convert bam2FastQ.\n";
            return(-1);
        }
        if(mySecondFile == NULL)
        {
            std::cerr << "Failed to open " << secondOut
                      << " so can't convert bam2FastQ.\n";
            return(-1);
        }
    }

    if((readName) || (strcmp(mySamHeader.getSortOrder(), "queryname") == 0))
    {
        readName = true;
    }
    else
    {
        // defaulting to coordinate sorted.
        samIn.setSortedValidation(SamFile::COORDINATE);
    }

    // Setup the '=' translation if the reference was specified.
    if(!refFile.IsEmpty())
    {
        GenomeSequence* refPtr = new GenomeSequence(refFile);
        samIn.SetReadSequenceTranslation(SamRecord::BASES);
        samIn.SetReference(refPtr);
    }

    SamRecord* recordPtr;
    int16_t samFlag;

    SamStatus::Status returnStatus = SamStatus::SUCCESS;
    while(returnStatus == SamStatus::SUCCESS)
    {
        recordPtr = myPool.getRecord();
        if(recordPtr == NULL)
        {
            // Failed to allocate a new record.
            throw(std::runtime_error("Failed to allocate a new SAM/BAM record"));
        }
        if(!samIn.ReadRecord(mySamHeader, *recordPtr))
        {
            // Failed to read a record.
            returnStatus = samIn.GetStatus();
            continue;
        }

        // Have a record.  Check to see if it is a pair or unpaired read.
        samFlag = recordPtr->getFlag();
        if(SamFlag::isPaired(samFlag))
        {
            if(readName)
            {
                handlePairedRN(*recordPtr);
            }
            else
            {
                handlePairedCoord(*recordPtr);
            }
        }
        else
        {
            ++myNumUnpaired;
            writeFastQ(*recordPtr, myUnpairedFile,
                       myUnpairedFileNameExt);
        }
    }

    // Flush All
    cleanUpMateMap(0, true);

    if(returnStatus == SamStatus::NO_MORE_RECS)
    {
        returnStatus = SamStatus::SUCCESS;
    }

    samIn.Close();
    closeFiles();
    
    // Output the results
    std::cerr << "\nFound " << myNumPairs << " read pairs.\n";
    std::cerr << "Found " << myNumUnpaired << " unpaired reads.\n";
    if(myNumMateFailures != 0)
    {
        std::cerr << "Failed to find mates for " << myNumMateFailures
                  << " reads, so they were written as unpaired\n"
                  << "  (not included in either of the above counts).\n";
    }
    if(myNumQualTagErrors != 0)
    {
        std::cerr << myNumQualTagErrors << " records did not have tag "
                  << myQField.c_str() << " or it was invalid, so the quality field was used for those records.\n";
    }

    return(returnStatus);
}
Ejemplo n.º 13
0
SamStatus::Status ClipOverlap::handleSortedByReadName(SamFile& samIn, 
                                                      SamFile* samOutPtr)
{
    // Set returnStatus to success.  It will be changed
    // to the failure reason if any of the writes fail.
    SamStatus::Status returnStatus = SamStatus::SUCCESS;

    // Read the sam records.
    SamRecord* prevSamRecord = NULL;
    SamRecord* samRecord = new SamRecord;
    SamRecord* tmpRecord = new SamRecord;
    if((samRecord == NULL) || (tmpRecord == NULL))
    {
        std::cerr << "Failed to allocate a SamRecord, so exit.\n";
        return(SamStatus::FAIL_MEM);
    }

    // Keep reading records until ReadRecord returns false.
    while(samIn.ReadRecord(mySamHeader, *samRecord))
    {
        int16_t flag = samRecord->getFlag();
        if((flag & myIntExcludeFlags) != 0)
        {
            // This read should not be checked for overlaps.

            // Check if there is a previous SamRecord.
            if(prevSamRecord != NULL)
            {
                // There is a previous record.
                // If it has a different read name, write it.
                if(strcmp(samRecord->getReadName(), 
                          prevSamRecord->getReadName()) != 0)
                {
                    // Different read name, so write the previous record.
                    if((samOutPtr != NULL) && !myOverlapsOnly)
                    {
                        if(!samOutPtr->WriteRecord(mySamHeader, *prevSamRecord))
                        {
                            // Failed to write a record.
                            fprintf(stderr, "%s\n", samOutPtr->GetStatusMessage());
                            returnStatus = samOutPtr->GetStatus();
                        }
                    }
                    // Clear the previous record info.
                    tmpRecord = prevSamRecord;
                    prevSamRecord = NULL;
                } 
                // If it has the same read name, leave it in case there is another read with that name
            }
            // This record is not being checked for overlaps, so just write it and continue
            if((samOutPtr != NULL) && !myOverlapsOnly)
            {
                if(!samOutPtr->WriteRecord(mySamHeader, *samRecord))
                {
                    // Failed to write a record.
                    fprintf(stderr, "%s\n", samOutPtr->GetStatusMessage());
                    returnStatus = samOutPtr->GetStatus();
                }
            }
            continue;
        }

        if(prevSamRecord == NULL)
        {
            // Nothing to compare this record to, so set this
            // record to the previous, and the next record.
            prevSamRecord = samRecord;
            samRecord = tmpRecord;
            tmpRecord = NULL;
            continue;
        }

        // Check if the read name matches the previous read name.
        if(strcmp(samRecord->getReadName(), 
                  prevSamRecord->getReadName()) == 0)
        {
            bool overlap = false;
            // Same Read Name, so check clipping.
            OverlapHandler::OverlapInfo prevClipInfo = 
                myOverlapHandler->getOverlapInfo(*prevSamRecord);
            OverlapHandler::OverlapInfo curClipInfo = 
                myOverlapHandler->getOverlapInfo(*samRecord);
            
            // If either indicate a complete clipping, clip both.
            if((prevClipInfo == OverlapHandler::NO_OVERLAP_WRONG_ORIENT) ||
               (curClipInfo == OverlapHandler::NO_OVERLAP_WRONG_ORIENT))
            {
                overlap = true;
                myOverlapHandler->handleNoOverlapWrongOrientation(*prevSamRecord);
                // Don't update stats since this is the 2nd in the pair
                myOverlapHandler->handleNoOverlapWrongOrientation(*samRecord, 
                                                                  false);
            }
            else if((prevClipInfo == OverlapHandler::OVERLAP) ||
                    (prevClipInfo == OverlapHandler::SAME_START))
            {
                // The previous read starts at or before the current one.
                overlap = true;
                myOverlapHandler->handleOverlapPair(*prevSamRecord,
                                                    *samRecord);
            }
            else if(curClipInfo == OverlapHandler::OVERLAP)
            {
                // The current read starts before the previous one.
                overlap = true;
                myOverlapHandler->handleOverlapPair(*samRecord,
                                                    *prevSamRecord);
            }
            
            // Found a read pair, so write both records if: 
            //   1) output file is specified
            //   AND
            //     2a) all records should be written
            //     OR
            //     2b) the pair overlaps
            if((samOutPtr != NULL) && (!myOverlapsOnly || overlap))
            {
                if(!samOutPtr->WriteRecord(mySamHeader, *prevSamRecord))
                {
                    // Failed to write a record.
                    fprintf(stderr, "%s\n", samOutPtr->GetStatusMessage());
                    returnStatus = samOutPtr->GetStatus();
                }
                if(!samOutPtr->WriteRecord(mySamHeader, *samRecord))
                {
                    // Failed to write a record.
                    fprintf(stderr, "%s\n", samOutPtr->GetStatusMessage());
                    returnStatus = samOutPtr->GetStatus();
                }
            }
            // Setup for the next read with no previous.
            tmpRecord = prevSamRecord;
            prevSamRecord = NULL;
            
        }
        else
        {
            // Read name does not match, so write the previous record
            // if we are writing all records.
            if((samOutPtr != NULL) && !myOverlapsOnly)
            {
                if(!samOutPtr->WriteRecord(mySamHeader, *prevSamRecord))
                {
                    // Failed to write a record.
                    fprintf(stderr, "%s\n", samOutPtr->GetStatusMessage());
                    returnStatus = samOutPtr->GetStatus();
                }
            }
            // Store this record as the previous.
            tmpRecord = prevSamRecord;
            prevSamRecord = samRecord;
            samRecord = tmpRecord;
            tmpRecord = NULL;
        }
    }

    // Write the previous record if there is one.
    if((samOutPtr != NULL) && (prevSamRecord != NULL) && !myOverlapsOnly)
    {
        if(!samOutPtr->WriteRecord(mySamHeader, *prevSamRecord))
        {
            // Failed to write a record.
            fprintf(stderr, "%s\n", samOutPtr->GetStatusMessage());
            returnStatus = samOutPtr->GetStatus();
        }
        delete prevSamRecord;
    }

    if(samRecord != NULL)
    {
        delete samRecord;
    }
    if(tmpRecord != NULL)
    {
        delete tmpRecord;
    }

    if(samIn.GetStatus() != SamStatus::NO_MORE_RECS)
    {
        return(samIn.GetStatus());
    }
    return(returnStatus);
}
Ejemplo n.º 14
0
// Read a record from the currently opened file.
bool SamFile::ReadRecord(SamFileHeader& header, 
                         SamRecord& record)
{
    myStatus = SamStatus::SUCCESS;

    if(myIsOpenForRead == false)
    {
        // File is not open for read
        myStatus.setStatus(SamStatus::FAIL_ORDER, 
                           "Cannot read record since the file is not open for reading");
        throw(std::runtime_error("SOFTWARE BUG: trying to read a SAM/BAM record prior to opening the file."));
        return(false);
    }

    if(myHasHeader == false)
    {
        // The header has not yet been read.
        // TODO - maybe just read the header.
        myStatus.setStatus(SamStatus::FAIL_ORDER, 
                           "Cannot read record since the header has not been read.");
        throw(std::runtime_error("SOFTWARE BUG: trying to read a SAM/BAM record prior to reading the header."));
        return(false);
    }

    // Check to see if a new region has been set.  If so, determine the
    // chunks for that region.
    if(myNewSection)
    {
        if(!processNewSection(header))
        {
            // Failed processing a new section.  Could be an 
            // order issue like the file not being open or the
            // indexed file not having been read.
            // processNewSection sets myStatus with the failure reason.
            return(false);
        }
    }

    // Read until a record is not successfully read or there are no more
    // requested records.
    while(myStatus == SamStatus::SUCCESS)
    {
        record.setReference(myRefPtr);
        record.setSequenceTranslation(myReadTranslation);

        // If reading by index, this method will setup to ensure it is in
        // the correct position for the next record (if not already there).
        // Sets myStatus if it could not move to a good section.
        // Just returns true if it is not setup to read by index.
        if(!ensureIndexedReadPosition())
        {
            // Either there are no more records in the section
            // or it failed to move to the right section, so there
            // is nothing more to read, stop looping.
            break;
        }
        
        // File is open for reading and the header has been read, so read the
        // next record.
        myInterfacePtr->readRecord(myFilePtr, header, record, myStatus);
        if(myStatus != SamStatus::SUCCESS)
        {
            // Failed to read the record, so break out of the loop.
            break;
        }

        // Successfully read a record, so check if we should filter it.
        // First check if it is out of the section.  Returns true
        // if not reading by sections, returns false if the record
        // is outside of the section.  Sets status to NO_MORE_RECS if
        // there is nothing left ot read in the section.
        if(!checkRecordInSection(record))
        {
            // The record is not in the section.
            // The while loop will detect if NO_MORE_RECS was set.
            continue;
        }

        // Check the flag for required/excluded flags.
        uint16_t flag = record.getFlag();
        if((flag & myRequiredFlags) != myRequiredFlags)
        {
            // The record does not conatain all required flags, so
            // continue looking.
            continue;
        }
        if((flag & myExcludedFlags) != 0)
        {
            // The record contains an excluded flag, so continue looking.
            continue;
        }

        //increment the record count.
        myRecordCount++;
        
        if(myStatistics != NULL)
        {
            // Statistics should be updated.
            myStatistics->updateStatistics(record);
        }
        
        // Successfully read the record, so check the sort order.
        if(!validateSortOrder(record, header))
        {
            // ValidateSortOrder sets the status on a failure.
            return(false);
        }
        return(true);

    } // End while loop that checks if a desired record is found or failure.

    // Return true if a record was found.
    return(myStatus == SamStatus::SUCCESS);
}
Ejemplo n.º 15
0
bool Recab::processReadApplyTable(SamRecord& samRecord)
{
    static BaseData data;
    static std::string readGroup;
    static std::string aligTypes;

    int seqLen = samRecord.getReadLength();

    uint16_t  flag = samRecord.getFlag();

    // Check if the flag contains an exclude.
    if((flag & myIntApplyExcludeFlags) != 0)
    {
        // Do not apply the recalibration table to this read.
        ++myNumApplySkipped;
        return(false);
    }
    ++myNumApplyReads;
   
    readGroup = samRecord.getString("RG").c_str();

    // Look for the read group in the map.
    // TODO - extra string constructor??
    RgInsertReturn insertRet = 
        myRg2Id.insert(std::pair<std::string, uint16_t>(readGroup, 0));
    if(insertRet.second == true)
    {
        // New element inserted.
        insertRet.first->second = myId2Rg.size();
        myId2Rg.push_back(readGroup);
    }

    data.rgid = insertRet.first->second;

    if(!myQField.IsEmpty())
    {
        // Check if there is an old quality.
        const String* oldQPtr =
            samRecord.getStringTag(myQField.c_str());
        if((oldQPtr != NULL) && (oldQPtr->Length() == seqLen))
        {
            // There is an old quality, so use that.
            myQualityStrings.oldq = oldQPtr->c_str();
        }
        else
        {
            myQualityStrings.oldq = samRecord.getQuality();
        }
    }
    else
    {
        myQualityStrings.oldq = samRecord.getQuality();
    }

    if(myQualityStrings.oldq.length() != (unsigned int)seqLen)
    {
        Logger::gLogger->warning("Quality is not the correct length, so skipping recalibration on that record.");
        return(false);
    }

    myQualityStrings.newq.resize(seqLen);

    ////////////////
    ////// iterate sequence
    ////////////////
    int32_t seqPos = 0;
    int seqIncr = 1;

    bool reverse;
    if(SamFlag::isReverse(flag))
    {
        reverse = true;
        seqPos = seqLen - 1;
        seqIncr = -1;
    }
    else
        reverse = false;

    // Check which read - this will be the same for all positions, so 
    // do this outside of the smaller loop.
    if(!SamFlag::isPaired(flag) || SamFlag::isFirstFragment(flag))
        // Mark as first if it is not paired or if it is the
        // first in the pair.
        data.read = 0;
    else
        data.read = 1;

    // Set unsetbase for curBase.
    // This will be used for the prebase of cycle 0.
    data.curBase = 'K';

    for (data.cycle = 0; data.cycle < seqLen; data.cycle++, seqPos += seqIncr)
    {
        // Set the preBase to the previous cycle's current base.
        // For cycle 0, curBase was set to a default value.
        data.preBase = data.curBase;

        // Get the current base.
        data.curBase = samRecord.getSequence(seqPos);

        if(reverse)
        {
            // Complement the current base.
            data.curBase =
                BaseAsciiMap::base2complement[(unsigned int)(data.curBase)];
        }

        // Get quality
        data.qual = 
            BaseUtilities::getPhredBaseQuality(myQualityStrings.oldq[seqPos]);

        // skip bases with quality below the minimum set.
        if(data.qual < myMinBaseQual)
        {
            myQualityStrings.newq[seqPos] = myQualityStrings.oldq[seqPos];
            continue;
        }

        // Update quality score
        uint8_t qemp = hasherrormodel.getQemp(data);
        qemp = mySqueeze.getQualCharFromQemp(qemp);
        if(qemp > myMaxBaseQualChar)
        {
            qemp = myMaxBaseQualChar;
        }
        myQualityStrings.newq[seqPos] = qemp;
    }

    if(!myStoreQualTag.IsEmpty())
    {
        samRecord.addTag(myStoreQualTag, 'Z', myQualityStrings.oldq.c_str());
    }
    samRecord.setQuality(myQualityStrings.newq.c_str());

    return true;
}
Ejemplo n.º 16
0
bool Recab::processReadBuildTable(SamRecord& samRecord)
{
    static BaseData data;
    static std::string chromosomeName;
    static std::string readGroup;
    static std::string aligTypes;

    int seqLen = samRecord.getReadLength();
    
    // Check if the parameters have been processed.
    if(!myParamsSetup)
    {
        // This throws an exception if the reference cannot be setup.
        processParams();
    }

    uint16_t  flag = samRecord.getFlag();

    if(!SamFlag::isMapped(flag))
    {
        // Unmapped, skip processing
        ++myUnMappedCount;
    }
    else
    {
        // This read is mapped.
        ++myMappedCount;
    }

    if(SamFlag::isSecondary(flag))
    {
        // Secondary read
        ++mySecondaryCount;
    }
    if(SamFlag::isDuplicate(flag))
    {
        ++myDupCount;
    }
    if(SamFlag::isQCFailure(flag))
    {
        ++myQCFailCount;
    }

    // Check if the flag contains an exclude.
    if((flag & myIntBuildExcludeFlags) != 0)
    {
        // Do not use this read for building the recalibration table.
        ++myNumBuildSkipped;
        return(false);
    }

    if(samRecord.getMapQuality() == 0)
    {
        // 0 mapping quality, so skip processing.
        ++myMapQual0Count;
        ++myNumBuildSkipped;
        return(false);
    }
    if(samRecord.getMapQuality() == 255)
    {
        // 255 mapping quality, so skip processing.
        ++myMapQual255Count;
        ++myNumBuildSkipped;
        return(false);
    }
    
    chromosomeName = samRecord.getReferenceName();
    readGroup = samRecord.getString("RG").c_str();

    // Look for the read group in the map.
    // TODO - extra string constructor??
    RgInsertReturn insertRet = 
        myRg2Id.insert(std::pair<std::string, uint16_t>(readGroup, 0));
    if(insertRet.second == true)
    {
        // New element inserted.
        insertRet.first->second = myId2Rg.size();
        myId2Rg.push_back(readGroup);
    }

    data.rgid = insertRet.first->second;


    //reverse
    bool reverse;
    if(SamFlag::isReverse(flag))
        reverse = true;
    else
        reverse = false;

    if(myReferenceGenome == NULL)
    {
        throw std::runtime_error("Failed to setup Reference File.\n");
    }

    genomeIndex_t mapPos = 
        myReferenceGenome->getGenomePosition(chromosomeName.c_str(), 
                                             samRecord.get1BasedPosition());

    if(mapPos==INVALID_GENOME_INDEX)
    {
    	Logger::gLogger->warning("INVALID_GENOME_INDEX (chrom:pos %s:%ld) and record skipped... Reference in BAM is different from the ref used here!", chromosomeName.c_str(), samRecord.get1BasedPosition());

        ++myNumBuildSkipped;
        return false;
    }

    if(!myQField.IsEmpty())
    {
        // Check if there is an old quality.
        const String* oldQPtr = 
            samRecord.getStringTag(myQField.c_str());
        if((oldQPtr != NULL) && (oldQPtr->Length() == seqLen))
        {
            // There is an old quality, so use that.
            myQualityStrings.oldq = oldQPtr->c_str();
        }
        else
        {
            // Tag was not found, so use the current quality.
            ++myNumQualTagErrors;
            if(myNumQualTagErrors == 1)
            {
                Logger::gLogger->warning("Recab: %s tag was not found/invalid, so using the quality field in records without the tag", myQField.c_str());
            }
            myQualityStrings.oldq = samRecord.getQuality();
        }
        //printf("%s\n",samRecord.getQuality());
        //printf("%s:%s\n",myQField.c_str(),temp.c_str());
    }
    else
    {
        myQualityStrings.oldq = samRecord.getQuality();
    }

    if(myQualityStrings.oldq.length() != (unsigned int)seqLen)
    {
        Logger::gLogger->warning("Quality is not the correct length, so skipping recalibration on that record.");
        ++myNumBuildSkipped;
        return(false);
    }

    aligTypes = "";
    Cigar* cigarPtr = samRecord.getCigarInfo();

    if(cigarPtr == NULL)
    {
        Logger::gLogger->warning("Failed to get the cigar");
        ++myNumBuildSkipped;
        return(false);
    }

    // This read will be used for building the recab table.
    ++myNumBuildReads;

    ////////////////
    ////// iterate sequence
    ////////////////
    genomeIndex_t refPos = 0;
    int32_t refOffset = 0;
    int32_t prevRefOffset = Cigar::INDEX_NA;
    int32_t seqPos = 0;
    int seqIncr = 1;
    if(reverse)
    {
        seqPos = seqLen - 1;
        seqIncr = -1;
    }

    // read
    if(!SamFlag::isPaired(flag) || SamFlag::isFirstFragment(flag))
        // Mark as first if it is not paired or if it is the
        // first in the pair.
        data.read = 0;
    else
        data.read = 1;

    // Set unsetbase for curBase.
    // This will be used for the prebase of cycle 0.
    data.curBase = 'K';

    for (data.cycle = 0; data.cycle < seqLen; data.cycle++, seqPos += seqIncr)
    {
        // Store the previous current base in preBase.
        data.preBase = data.curBase;

        // Get the current base before checking if we are going to
        // process this position so it will be set for the next position.
        data.curBase = samRecord.getSequence(seqPos);
        if(reverse)
        {
            // Complement the current base.
            // The prebase is already complemented.
            data.curBase = 
                BaseAsciiMap::base2complement[(unsigned int)(data.curBase)];
        }
        
        // Get the reference offset.
        refOffset = cigarPtr->getRefOffset(seqPos);
        if(refOffset == Cigar::INDEX_NA)
        {
            // Not a match/mismatch, so continue to the next one which will
            // not have a previous match/mismatch.
            // Set previous ref offset to a negative so
            // the next one won't be kept.
            prevRefOffset = -2;
            continue;
        }

        // This one is a match.
        refPos = mapPos + refOffset;

        // Check to see if we should process this position.
        // Do not process if it is cycle 0 and:
        //   1) current base is in dbsnp
        if(data.cycle == 0)
        {
            if(!(myDbsnpFile.IsEmpty()) && myDbSNP[refPos])
            {
                // Save the previous reference offset.
                ++myNumDBSnpSkips;
                prevRefOffset = refOffset;
                continue;
            }
        }
        else
        {
            // Do not process if it is not cycle 0 and:
            //   1) previous reference position not adjacent 
            //      (not a match/mismatch)
            //   2) previous base is in dbsnp
            //   3) current base is in dbsnp
            if((!myKeepPrevNonAdjacent && (refOffset != (prevRefOffset + seqIncr))) ||
               (data.preBase == 'K'))
            {
                // Save the previous reference offset.
                prevRefOffset = refOffset;
                continue;
            }
            if(!(myDbsnpFile.IsEmpty()) && 
               (myDbSNP[refPos] ||
                (!myKeepPrevDbsnp && myDbSNP[refPos - seqIncr])))
            {
                ++myNumDBSnpSkips;
                // Save the previous reference offset.
                prevRefOffset = refOffset;
                continue;
            }
       }
        
        // Save the previous reference offset.
        prevRefOffset = refOffset;

        // Set the reference & read bases in the Covariates
        char refBase = (*myReferenceGenome)[refPos];

        if(BaseUtilities::isAmbiguous(refBase))
        {
            // N reference, so skip it when building the table.
            ++myAmbiguous;
            continue;
        }

        if(reverse)
        {
            refBase = BaseAsciiMap::base2complement[(unsigned int)(refBase)];
        }

        // Get quality char
        data.qual = 
            BaseUtilities::getPhredBaseQuality(myQualityStrings.oldq[seqPos]);

        // skip bases with quality below the minimum set.
        if(data.qual < myMinBaseQual)
        {
            ++mySubMinQual;
            continue;
        }

        if(BaseUtilities::areEqual(refBase, data.curBase)
           && (BaseAsciiMap::base2int[(unsigned int)(data.curBase)] < 4))
            myBMatchCount++;
        else
            myBMismatchCount++;

        hasherrormodel.setCell(data, refBase);
        myBasecounts++;
    }
    return true;
}
Ejemplo n.º 17
0
/*
	if a discordant read is mapped to MEI (overlap with MEI coord)
		add centor of ( anchor_end + 3*avr_ins_var )
	skip unmap & clip
	check 3 types at the same time
*/
void Sites::AddDiscoverSiteFromSingleBam( SingleSampleInfo & si )
{
/*for(int i=0;i<NMEI; i++) {
std::cout << "m=" << i << ": ";
for(map<string, map<int, bool> >::iterator t=meiCoord[m].begin(); t!=meiCoord[m].end(); t++)
std::cout << t->first << "->" << t->second.size() << " ";
std::cout << std::endl;
}*/

	avr_read_length = si.avr_read_length;
	avr_ins_size = si.avr_ins_size;
	min_read_length = avr_read_length / 3;
	current_depth = si.depth;
//	total_depth += current_depth;

	resetDepthAddFlag();
	
	SamFile bam;
	SamFileHeader bam_header;
	OpenBamAndBai( bam,bam_header, si.bam_name );
	for( int i=0; i<pchr_list->size(); i++ ) {
		string chr = (*pchr_list)[i];
//		if ( !single_chr.empty() && chr.compare(single_chr)!=0 )
//			continue;
		if ( siteList.find(chr) == siteList.end() )
			siteList[chr].clear();
//		map<string, map<int, SingleSite> >::iterator pchr = siteList[m].find(chr);
//		map<string, map<int, bool> >::iterator coord_chr_ptr = meiCoord[m].find(chr);
//		if (coord_chr_ptr == meiCoord[m].end())
//			continue;
		bool section_status;
		if (range_start<0) { // no range
			section_status = bam.SetReadSection( chr.c_str() );
			if (!section_status) {
				string str = "Cannot set read section at chr " + chr;
				morphWarning( str );
			}
		}
		else { // set range
			section_status = bam.SetReadSection( chr.c_str(), range_start, range_end );
			if (!section_status) {
				string str = "Cannot set read section at chr " + chr + " " + std::to_string(range_start) + "-" + std::to_string(range_end); 
				morphWarning( str );
			}			
		}
		
		// DO ADDING
//		if (siteList[chr].empty())
//			p_reach_last = 1;
//		else {
//			p_reach_last = 0;
		pnearest = siteList[chr].begin();
//		}
		SingleSite new_site; // temporary cluster. will be added to map later.
		new_site.depth = current_depth;
		bool start_new = 1; // check if need to add new_site to map and start new new_site
		SamRecord rec;
		int between = 0; // count #reads after new_site.end. If end changed, add it to rcount and reset to zero
		while( bam.ReadRecord( bam_header, rec ) ) {
			if (!start_new) {
				if (rec.get1BasedPosition() >= new_site.end)
					between++;
				else
					new_site.rcount++;
			}
			if (rec.getFlag() & 0x2)
				continue;
			if ( OneEndUnmap( rec.getFlag() ) )
				continue;
			if ( IsSupplementary(rec.getFlag()) )
				continue;
			if ( rec.getReadLength() < min_read_length )
				continue;
			if ( rec.getMapQuality() < MIN_QUALITY )
				continue;
			if (chr.compare(rec.getMateReferenceName())==0 && rec.getInsertSize() < abs(avr_ins_size*2))
				continue;
			bool is_mei = 0;
			vector<bool> is_in_coord;
			is_in_coord.resize(3, 0);
			for(int m=0; m<NMEI; m++) {
				map<string, map<int, bool> >::iterator coord_chr_ptr = meiCoord[m].find(rec.getMateReferenceName());
				if (coord_chr_ptr == meiCoord[m].end())
					is_in_coord[m] = 0;
				else
					is_in_coord[m] = isWithinCoord( rec.get1BasedMatePosition(), coord_chr_ptr->second ); // within MEI coord
				if (is_in_coord[m])
					is_mei = 1;
			}
			if (!is_mei)
				continue;
			if (start_new) {
				setNewCluster( is_in_coord, new_site,rec);
				start_new = 0;
				between = 0;
			}
			else { // add to existing cluster
				if ( rec.get1BasedPosition() > new_site.end + avr_ins_size ) { // start new coord
					addClusterToMap(new_site, siteList[chr]);
					setNewCluster( is_in_coord, new_site, rec);
					start_new = 0;
					between = 0;
				}
				else {
					addToCurrentCluster( is_in_coord, new_site, rec);
					new_site.rcount += between;
					between = 0;
				}
			}
		}
		// add last one
		if (!start_new)
			addClusterToMap(new_site, siteList[chr]);
	}
	bam.Close();
}
Ejemplo n.º 18
0
// Add an entry to this pileup element.  
void PileupElementBaseQual::addEntry(SamRecord& record)
{
    // Call the base class:
    PileupElement::addEntry(record);

    if(myRefAllele.empty())
    {
    	genomeIndex_t markerIndex = (*myRefSeq).getGenomePosition(getChromosome(), static_cast<uint32_t>(getRefPosition()+1));
        myRefAllele = (*myRefSeq)[markerIndex];
    }

    // Increment the index
    ++myIndex;
    
    // if the index has gone beyond the allocated space, double the size.
    if(myIndex >= myAllocatedSize)
    {
        char* tempBuffer = (char*)realloc(myBases, myAllocatedSize * 2);
        if(tempBuffer == NULL)
        {
            std::cerr << "Memory Allocation Failure\n";
            // TODO
            return;
        }
        myBases = tempBuffer;
        int8_t* tempInt8Buffer = (int8_t*)realloc(myMapQualities, myAllocatedSize * 2 * sizeof(int8_t));
        if(tempInt8Buffer == NULL)
        {
            std::cerr << "Memory Allocation Failure\n";
            // TODO
            return;
        }
        myMapQualities = tempInt8Buffer; 
        tempInt8Buffer = (int8_t*)realloc(myQualities, myAllocatedSize * 2 * sizeof(int8_t));
        if(tempInt8Buffer == NULL)
        {
            std::cerr << "Memory Allocation Failure\n";
            // TODO
            return;
        }
        myQualities = tempInt8Buffer;
        tempBuffer = (char*)realloc(myStrands, myAllocatedSize * 2);
        if(tempBuffer == NULL)
        {
            std::cerr << "Memory Allocation Failure\n";
            // TODO
            return;
        }
        myStrands = tempBuffer;
        tempInt8Buffer = (int8_t*)realloc(myCycles, myAllocatedSize * 2 * sizeof(int8_t));
        if(tempInt8Buffer == NULL)
        {
            std::cerr << "Memory Allocation Failure\n";
            // TODO
            return;
        }
        myCycles = tempInt8Buffer; 
        int16_t* tempInt16Buffer = (int16_t*)realloc(myGLScores, myAllocatedSize * 2 * sizeof(int16_t));
        if(tempInt8Buffer == NULL)
        {
            std::cerr << "Memory Allocation Failure\n";
            // TODO
            return;
        }
        myGLScores = tempInt16Buffer;
        myAllocatedSize = myAllocatedSize * 2;
    }

    Cigar* cigar = record.getCigarInfo();
    
    if(cigar == NULL)
    {
        throw std::runtime_error("Failed to retrieve cigar info from the record.");
    }

    int32_t readIndex = 
        cigar->getQueryIndex(getRefPosition(), record.get0BasedPosition());

    // If the readPosition is N/A, this is a deletion.
    if(readIndex != CigarRoller::INDEX_NA)
    {
        char base = record.getSequence(readIndex);
        int8_t mapQual = record.getMapQuality();
        //-33 to obtain the PHRED base quality
        char qual = record.getQuality(readIndex) - 33;
        if(qual == UNSET_QUAL)
        {
            qual = ' ';
        }
        char strand = (record.getFlag() & 0x0010) ? 'R' : 'F';
        int cycle = strand == 'F' ? readIndex + 1 : record.getReadLength() -  readIndex;
        myBases[myIndex] = base;
        myMapQualities[myIndex] = mapQual;
        myQualities[myIndex] = qual;
        myStrands[myIndex] = strand;
        myCycles[myIndex] = cycle;
    }
    else if(myAddDelAsBase)
    {
        int8_t mapQual = record.getMapQuality();
        char strand = (record.getFlag() & 0x0010) ? 'R' : 'F';
        myBases[myIndex] = '-';
        myMapQualities[myIndex] = mapQual;
        myQualities[myIndex] = -1;
        myStrands[myIndex] = strand;
        myCycles[myIndex] = -1;
    }
    else
    {
        // Do not add a deletion.
        // Did not add any entries, so decrement the index counter since the
        // index was not used.
        --myIndex;
    }
}
Ejemplo n.º 19
0
// When a record is read, check if it is a duplicate or
// store for future checking.
void Dedup::checkDups(SamRecord& record, uint32_t recordCount)
{
    // Only inside this method if the record is mapped.

    // Get the key for this record.
    static DupKey key;
    static DupKey mateKey;
    key.updateKey(record, getLibraryID(record));

    int flag = record.getFlag(); 
    bool recordPaired = SamFlag::isPaired(flag) && SamFlag::isMateMapped(flag);
    int sumBaseQual = getBaseQuality(record);

    int32_t chromID = record.getReferenceID();
    int32_t mateChromID = record.getMateReferenceID();

    // If we are one-chrom and the mate is not on the same chromosome, 
    // mark it as not paired.
    if(myOneChrom && (chromID != mateChromID))
    {
        recordPaired = false;
    }
    
    // Look in the map to see if an entry for this key exists.
    FragmentMapInsertReturn ireturn = 
        myFragmentMap.insert(std::make_pair(key, ReadData()));

    ReadData* readData = &(ireturn.first->second);

    // Mark this record's data in the fragment record if this is the first
    // entry or if it is a duplicate and the old record is not paired and 
    // the new record is paired or the has a higher quality.
    if((ireturn.second == true) ||
       ((readData->paired == false) && 
        (recordPaired || (sumBaseQual > readData->sumBaseQual))))
    {
        // If there was a previous record, mark it duplicate and release
        // the old record
        if(ireturn.second == false)
        {
            // Mark the old record as a DUPLICATE!
            handleDuplicate(readData->recordIndex, readData->recordPtr);
        }
        // Store this record for later duplicate checking.
        readData->sumBaseQual = sumBaseQual;
        readData->recordIndex = recordCount;
        readData->paired = recordPaired;
        if(recordPaired)
        {
            readData->recordPtr = NULL;
        }
        else
        {
            readData->recordPtr = &record;
        }
    }
    else
    {
        // The old record is not a duplicate so the new record is
        // a duplicate if it is not paired.
        if(recordPaired == false)
        {
            // This record is a duplicate, so mark it and release it.
            handleDuplicate(recordCount, &record);
        }
    }

    // Only paired processing is left, so return if not paired.
    if(recordPaired == false)
    {
        // Not paired, no more operations required, so return.
        return;
    }
    
    // This is a paired record, so check for its mate.
    uint64_t readPos = 
        SamHelper::combineChromPos(chromID,
                                   record.get0BasedPosition());
    uint64_t matePos =
        SamHelper::combineChromPos(mateChromID, 
                                   record.get0BasedMatePosition());
    SamRecord* mateRecord = NULL;
    int mateIndex = 0;
    
    // Check to see if the mate is prior to this record.
    if(matePos <= readPos)
    {
        // The mate map is stored by the mate position, so look for this 
        // record's position.
        // The mate should be in the mate map, so find it.
        std::pair<MateMap::iterator,MateMap::iterator> matches =
            myMateMap.equal_range(readPos);
        // Loop through the elements that matched the pos looking for the mate.
        for(MateMap::iterator iter = matches.first; 
            iter != matches.second; iter++)
        {
            if(strcmp((*iter).second.recordPtr->getReadName(), 
                      record.getReadName()) == 0)
            {
                // Found the match.
                ReadData* mateData = &((*iter).second);
                // Update the quality and track the mate record and index.
                sumBaseQual += mateData->sumBaseQual;
                mateIndex = mateData->recordIndex;
                mateRecord = mateData->recordPtr;
                // Remove the entry from the map.
                myMateMap.erase(iter);
                break;
            }
        }
    }
    if((mateRecord == NULL) && (matePos >= readPos))
    {
        // Haven't gotten to the mate yet, so store this record.
        MateMap::iterator mateIter = 
            myMateMap.insert(std::make_pair(matePos, ReadData()));
        mateIter->second.sumBaseQual = sumBaseQual;
        mateIter->second.recordPtr = &record;
        mateIter->second.recordIndex = recordCount;
        // No more processing for this record is necessary.
        return;
    }

    if(mateRecord == NULL)
    {
        // Passed the mate, but it was not found.
        handleMissingMate(&record);
        return;
    }

    // Make the paired key.
    mateKey.updateKey(*mateRecord, getLibraryID(*mateRecord));
    PairedKey pkey(key, mateKey);

    // Check to see if this pair is a duplicate.
    PairedMapInsertReturn pairedReturn = 
        myPairedMap.insert(std::make_pair(pkey,PairedData()));
    PairedData* storedPair = &(pairedReturn.first->second);

    // Get the index for "record 1" - the one with the earlier coordinate.
    int record1Index = getFirstIndex(key, recordCount,
                                          mateKey, mateIndex);

    // Check if we have already found a duplicate pair.
    // If there is no duplicate found, there is nothing more to do.
    if(pairedReturn.second == false)
    {
        // Duplicate found.
        bool keepStored = true;
        if(pairedReturn.first->second.sumBaseQual < sumBaseQual)
        {
            // The new pair has higher quality, so keep that.
            keepStored = false;
        }
        else if(pairedReturn.first->second.sumBaseQual == sumBaseQual)
        {
            // Same quality, so keep the one with the earlier record1Index.
            if(record1Index < storedPair->record1Index)
            {
                // The new pair has an earlier lower coordinate read,
                // so keep that.
                keepStored = false;
            }
        }
        // Check to see which one should be kept by checking qualities.
        if(keepStored)
        {
            // The old pair had higher quality so mark the new pair as a
            // duplicate and release them.
            handleDuplicate(mateIndex, mateRecord);
            handleDuplicate(recordCount, &record);
        }
        else
        {
            // The new pair has higher quality, so keep that.
            // First mark the previous one as duplicates and release them.
            handleDuplicate(storedPair->record1Index, storedPair->record1Ptr);
            handleDuplicate(storedPair->record2Index, storedPair->record2Ptr);
            // Store this pair's information.
            if(record1Index == mateIndex)
            {
                // Mate has a lower coordinate, so make mate
                // record1.
                storedPair->sumBaseQual = sumBaseQual;
                storedPair->record1Ptr = mateRecord;
                storedPair->record2Ptr = &record;
                storedPair->record1Index = mateIndex;
                storedPair->record2Index = recordCount;
            }
            else
            {
                // This record has a lower coordinate, so make it
                // record1.
                storedPair->sumBaseQual = sumBaseQual;
                storedPair->record1Ptr = &record;
                storedPair->record2Ptr = mateRecord;
                storedPair->record1Index = recordCount;
                storedPair->record2Index = mateIndex;
            }
        }
    }
    else
    {
        // Store this pair's information.
        storedPair->sumBaseQual = sumBaseQual;

        if(record1Index == mateIndex)
        {
            // Mate has a lower coordinate, so make mate
            // record1.
            storedPair->record1Ptr = mateRecord;
            storedPair->record2Ptr = &record;
            storedPair->record1Index = mateIndex;
            storedPair->record2Index = recordCount;
        }
        else
        {
            // This record has a lower coordinate, so make it
            // record1.
            storedPair->record1Ptr = &record;
            storedPair->record2Ptr = mateRecord;
            storedPair->record1Index = recordCount;
            storedPair->record2Index = mateIndex;
        }
    }
}
Ejemplo n.º 20
0
// When a record is read, check if it is a duplicate or
// store for future checking.
void Dedup_LowMem::checkDups(SamRecord& record, uint32_t recordCount)
{
    // Only inside this method if the record is mapped.

    // Get the key for this record.
    static DupKey key;
    key.initKey(record, getLibraryID(record));

    int flag = record.getFlag();
    bool recordPaired = SamFlag::isPaired(flag) && SamFlag::isMateMapped(flag);
    int sumBaseQual = getBaseQuality(record);

    int32_t chromID = record.getReferenceID();
    int32_t mateChromID = record.getMateReferenceID();

    // If we are one-chrom and the mate is not on the same chromosome,
    // mark it as not paired.
    if(myOneChrom && (chromID != mateChromID))
    {
        recordPaired = false;
    }

    // Look in the fragment map to see if an entry for this key exists.
    FragmentMapInsertReturn ireturn =
        myFragmentMap.insert(std::make_pair(key, FragData()));

    FragData* fragData = &(ireturn.first->second);

    // Enter the new record in the fragData if (any of the below):
    // 1) there is no previous entry for this key (ireturn.second == true)
    // or
    // 2) the previous entry is not paired
    //    AND
    //     a) the new record is paired
    //     or
    //     b) the new record has higher quality
    if((ireturn.second == true) ||
            ((fragData->paired == false) &&
             (recordPaired || (sumBaseQual > fragData->sumBaseQual))))
    {
        // Check if this is a new key.
        if(ireturn.second == true)
        {
            // New entry, so build the recalibration table now.
            if(myDoRecab)
            {
                myRecab.processReadBuildTable(record);
            }
        }
        else if(fragData->paired == false)
        {
            // There was a previous record and it is not paired,
            // so mark it as a duplicate.
            // Duplicate checking/marking for pairs is handled below.
            handleDuplicate(fragData->recordIndex);
        }

        // Store this record for later duplicate checking.
        fragData->sumBaseQual = sumBaseQual;
        fragData->recordIndex = recordCount;
        fragData->paired = recordPaired;
    }
    else
    {
        // Leave the old record in fragData.
        // If the new record is not paired, handle it as a duplicate.
        if(recordPaired == false)
        {
            // This record is a duplicate, so mark it and release it.
            handleDuplicate(recordCount);
        }
    }

    // Only paired processing is left, so return if not paired.
    if(recordPaired == false)
    {
        // Not paired, no more operations required, so return.
        return;
    }

    // This is a paired record, so check for its mate.
    uint64_t readPos =
        SamHelper::combineChromPos(chromID,
                                   record.get0BasedPosition());
    uint64_t matePos =
        SamHelper::combineChromPos(mateChromID,
                                   record.get0BasedMatePosition());
    int mateIndex = -1;
    MateData* mateData = NULL;

    // Check to see if the mate is prior to this record.
    if(matePos <= readPos)
    {
        // The mate map is stored by the mate position, so look for this
        // record's position.
        // The mate should be in the mate map, so find it.
        std::pair<MateMap::iterator,MateMap::iterator> matches =
            myMateMap.equal_range(readPos);
        // Loop through the elements that matched the pos looking for the mate.
        for(MateMap::iterator iter = matches.first;
                iter != matches.second; iter++)
        {
            if(strcmp((*iter).second.readName.c_str(),
                      record.getReadName()) == 0)
            {
                // Found the match.
                mateData = &((*iter).second);
                // Update the quality and track the mate record and index.
                sumBaseQual += mateData->sumBaseQual;
                mateIndex = mateData->recordIndex;
                // Remove the entry from the map.
                myMateMap.erase(iter);
                break;
            }
        }
    }
    if(mateData == NULL)
    {
        if(matePos >= readPos)
        {
            // Haven't gotten to the mate yet, so store this record.
            MateMap::iterator mateIter =
                myMateMap.insert(std::make_pair(matePos, MateData()));
            mateIter->second.sumBaseQual = sumBaseQual;
            mateIter->second.recordIndex = recordCount;
            mateIter->second.key.copy(key);
            mateIter->second.readName = record.getReadName();
        }
        else
        {
            // Passed the mate, but it was not found.
            handleMissingMate(record.getReferenceID(), record.getMateReferenceID());
        }
        return;
    }

    // Make the paired key.
    PairedKey pkey(key, mateData->key);

    // Check to see if this pair is a duplicate.
    PairedMapInsertReturn pairedReturn =
        myPairedMap.insert(std::make_pair(pkey,PairedData()));
    PairedData* storedPair = &(pairedReturn.first->second);

    // Get the index for "record 1" - the one with the earlier coordinate.
    int record1Index = getFirstIndex(key, recordCount,
                                     mateData->key, mateIndex);

    // Check if we have already found a duplicate pair.
    // If there is no duplicate found, there is nothing more to do.
    if(pairedReturn.second == false)
    {
        // Duplicate found.
        bool keepStored = true;
        if(pairedReturn.first->second.sumBaseQual < sumBaseQual)
        {
            // The new pair has higher quality, so keep that.
            keepStored = false;
        }
        else if(pairedReturn.first->second.sumBaseQual == sumBaseQual)
        {
            // Same quality, so keep the one with the earlier record1Index.
            if(record1Index < storedPair->record1Index)
            {
                // The new pair has an earlier lower coordinate read,
                // so keep that.
                keepStored = false;
            }
        }
        // Check to see which one should be kept by checking qualities.
        if(keepStored)
        {
            // The old pair had higher quality so mark the new pair as a
            // duplicate and release them.
            handleDuplicate(mateIndex);
            handleDuplicate(recordCount);
        }
        else
        {
            // The new pair has higher quality, so keep that.
            // First mark the previous one as duplicates and release them.
            handleDuplicate(storedPair->record1Index);
            handleDuplicate(storedPair->record2Index);
            // Store this pair's information.
            if(record1Index == mateIndex)
            {
                // Mate has a lower coordinate, so make mate
                // record1.
                storedPair->sumBaseQual = sumBaseQual;
                storedPair->record1Index = mateIndex;
                storedPair->record2Index = recordCount;
            }
            else
            {
                // This record has a lower coordinate, so make it
                // record1.
                storedPair->sumBaseQual = sumBaseQual;
                storedPair->record1Index = recordCount;
                storedPair->record2Index = mateIndex;
            }
        }
    }
    else
    {
        // Store this pair's information.
        storedPair->sumBaseQual = sumBaseQual;

        if(record1Index == mateIndex)
        {
            // Mate has a lower coordinate, so make mate
            // record1.
            storedPair->record1Index = mateIndex;
            storedPair->record2Index = recordCount;
        }
        else
        {
            // This record has a lower coordinate, so make it
            // record1.
            storedPair->record1Index = recordCount;
            storedPair->record2Index = mateIndex;
        }
    }
}
Ejemplo n.º 21
0
void Bam2FastQ::handlePairedCoord(SamRecord& samRec)
{
    static uint64_t readPos;
    static uint64_t matePos;
    static SamRecord* mateRec;

    // This is a paired record, so check for its mate.
    readPos = SamHelper::combineChromPos(samRec.getReferenceID(),
                                         samRec.get0BasedPosition());
    matePos = SamHelper::combineChromPos(samRec.getMateReferenceID(), 
                                         samRec.get0BasedMatePosition());
 
    // Check to see if the mate is prior to this record.
    if(matePos <= readPos)
    {
        // The mate is prior to this position, so look for this record.
        mateRec = myMateMap.getMate(samRec);
        if(mateRec == NULL)
        {
            // If they are the same position, add it to the map.
            if(matePos == readPos)
            {
                myMateMap.add(samRec);

                // Check to see if the mate map can be cleaned up prior
                // to this position.
                cleanUpMateMap(readPos);
            }
            else
            {
                // Paired Read, but could not find mate.
                std::cerr << "Paired Read, " << samRec.getReadName()
                          << " but couldn't find mate, so writing as "
                          << "unpaired (single-ended)\n";
                ++myNumMateFailures;
                writeFastQ(samRec, myUnpairedFile, myUnpairedFileNameExt);
            }
        }
        else
        {
            // Found the mate.
            ++myNumPairs;
            // Check which is the first in the pair.
            if(SamFlag::isFirstFragment(samRec.getFlag()))
            {
                if(SamFlag::isFirstFragment(mateRec->getFlag()))
                {
                    std::cerr << "Both reads of " << samRec.getReadName()
                              << " are first fragment, so "
                              << "splitting one to be in the 2nd fastq.\n";
                }
                writeFastQ(samRec, myFirstFile, myFirstFileNameExt, myFirstRNExt.c_str());
                writeFastQ(*mateRec, mySecondFile, mySecondFileNameExt, mySecondRNExt.c_str());
            }
            else
            {
                if(!SamFlag::isFirstFragment(mateRec->getFlag()))
                {
                    std::cerr << "Neither read of " << samRec.getReadName()
                              << " are first fragment, so "
                              << "splitting one to be in the 2nd fastq.\n";
                }
                writeFastQ(*mateRec, myFirstFile, myFirstFileNameExt, myFirstRNExt.c_str());
                writeFastQ(samRec, mySecondFile, mySecondFileNameExt, mySecondRNExt.c_str());
            }
        }
    }
    else
    {
        // Haven't gotten to the mate yet, so store it.
        myMateMap.add(samRec);

        // Check to see if the mate map can be cleaned up.
        cleanUpMateMap(readPos);
    }
}
Ejemplo n.º 22
0
// main function
int TrimBam::execute(int argc, char ** argv)
{
  SamFile samIn;
  SamFile samOut;
  int numTrimBaseL = 0;
  int numTrimBaseR = 0;
  bool noeof = false;
  bool ignoreStrand = false;
  bool noPhoneHome = false;
  std::string inName = "";
  std::string outName = "";

  if ( argc < 5 ) {
    usage();
    std::cerr << "ERROR: Incorrect number of parameters specified\n";
    return(-1);
  }
  inName = argv[2];
  outName = argv[3];

  static struct option getopt_long_options[] = {
      // Input options
      { "left", required_argument, NULL, 'L'},
      { "right", required_argument, NULL, 'R'},
      { "ignoreStrand", no_argument, NULL, 'i'},
      { "noeof", no_argument, NULL, 'n'},
      { "noPhoneHome", no_argument, NULL, 'p'},
      { "nophonehome", no_argument, NULL, 'P'},
      { "phoneHomeThinning", required_argument, NULL, 't'},
      { "phonehomethinning", required_argument, NULL, 'T'},
      { NULL, 0, NULL, 0 },
  };
  
  int argIndex = 4;
  if(argv[argIndex][0] != '-')
  {
      // This is the number of bases to trim off both sides
      // so convert to a number.
      numTrimBaseL = atoi(argv[argIndex]);
      numTrimBaseR = numTrimBaseL;
      ++argIndex;
  }

  int c = 0;
  int n_option_index = 0;
  // Process any additional parameters
  while ( ( c = getopt_long(argc, argv,
                            "L:R:in", getopt_long_options, &n_option_index) )
          != -1 )
  {
      switch(c) 
      {
          case 'L':
              numTrimBaseL = atoi(optarg);
              break;
          case 'R':
              numTrimBaseR = atoi(optarg);
              break;
          case 'i':
              ignoreStrand = true;
              break;
          case 'n':
              noeof = true;
              break;
          case 'p':
          case 'P':
              noPhoneHome = true;
              break;
          case 't':
          case 'T':
              PhoneHome::allThinning = atoi(optarg);
              break;
          default:
              fprintf(stderr,"ERROR: Unrecognized option %s\n",
                      getopt_long_options[n_option_index].name);
              return(-1);
      }
  }

  if(!noPhoneHome)
  {
      PhoneHome::checkVersion(getProgramName(), VERSION);
  }
  
  if(noeof)
  {
      // Set that the eof block is not required.
      BgzfFileType::setRequireEofBlock(false);
  }

  if ( ! samIn.OpenForRead(inName.c_str()) ) {
      fprintf(stderr, "***Problem opening %s\n",inName.c_str());
    return(-1);
  }

  if(!samOut.OpenForWrite(outName.c_str())) {
    fprintf(stderr, "%s\n", samOut.GetStatusMessage());
    return(samOut.GetStatus());
  }
  
  fprintf(stderr,"Arguments in effect: \n");
  fprintf(stderr,"\tInput file : %s\n",inName.c_str());
  fprintf(stderr,"\tOutput file : %s\n",outName.c_str());
  if(numTrimBaseL == numTrimBaseR)
  {
      fprintf(stderr,"\t#Bases to trim from each side : %d\n", numTrimBaseL);
  }
  else
  {
      fprintf(stderr,"\t#Bases to trim from the left of forward strands : %d\n",
              numTrimBaseL);
      fprintf(stderr,"\t#Bases to trim from the right of forward strands: %d\n",
              numTrimBaseR);
      if(!ignoreStrand)
      {
          // By default, reverse strands are treated the opposite.
          fprintf(stderr,"\t#Bases to trim from the left of reverse strands : %d\n",
                  numTrimBaseR);
          fprintf(stderr,"\t#Bases to trim from the right of reverse strands : %d\n",
                  numTrimBaseL);
      }
      else
      {
          // ignore strand, treating forward & reverse strands the same
          fprintf(stderr,"\t#Bases to trim from the left of reverse strands : %d\n",
                  numTrimBaseL);
          fprintf(stderr,"\t#Bases to trim from the right of reverse strands : %d\n",
                  numTrimBaseR);
      }
  }
 
   // Read the sam header.
   SamFileHeader samHeader;
   if(!samIn.ReadHeader(samHeader))
   {
      fprintf(stderr, "%s\n", samIn.GetStatusMessage());
      return(samIn.GetStatus());
   }

   // Write the sam header.
   if(!samOut.WriteHeader(samHeader))
   {
      fprintf(stderr, "%s\n", samOut.GetStatusMessage());
      return(samOut.GetStatus());     
   }

   SamRecord samRecord;
   char seq[65536];
   char qual[65536];
   int i, len;

   // Keep reading records until ReadRecord returns false.
   while(samIn.ReadRecord(samHeader, samRecord)) {
     // Successfully read a record from the file, so write it.
     strcpy(seq,samRecord.getSequence());
     strcpy(qual,samRecord.getQuality());

     // Number of bases to trim from the left/right,
     // set based on ignoreStrand flag and strand info.
     int trimLeft = numTrimBaseL;
     int trimRight = numTrimBaseR;
     if(!ignoreStrand)
     {
         if(SamFlag::isReverse(samRecord.getFlag()))
         {
             // We are reversing the reverse reads,
             // so swap the left & right trim counts.
             trimRight = numTrimBaseL;
             trimLeft = numTrimBaseR;
         }
     }

     len = strlen(seq);
     // Do not trim if sequence is '*'
     if ( strcmp(seq, "*") != 0 ) {
       bool qualValue = true;
       if(strcmp(qual, "*") == 0)
       {
           qualValue = false;
       }
       int qualLen = strlen(qual);
       if ( (qualLen != len) && qualValue ) {
         fprintf(stderr,"ERROR: Sequence and Quality have different length\n");
         return(-1);
       }
       if ( len < (trimLeft + trimRight) ) {
         // Read Length is less than the total number of bases to trim,
         // so trim the entire read.
         for(i=0; i < len; ++i) {
           seq[i] = 'N';
           if ( qualValue ) {
             qual[i] = '!';
           }
         }
       }
       else
       {
           // Read Length is larger than the total number of bases to trim,
           // so trim from the left, then from the right.
           for(i=0; i < trimLeft; ++i)
           {
               // Trim the bases from the left.
               seq[i] = 'N';
               if ( qualValue )
               {
                   qual[i] = '!';
               }
           }
           for(i = 0; i < trimRight; i++)
           {
               seq[len-i-1] = 'N';
               if(qualValue)
               {
                   qual[len-i-1] = '!';
               }
           }
       }
       samRecord.setSequence(seq);
       samRecord.setQuality(qual);
     }

     if(!samOut.WriteRecord(samHeader, samRecord)) {
         // Failed to write a record.
       fprintf(stderr, "Failure in writing record %s\n", samOut.GetStatusMessage());
       return(-1);
     }
   }
   
   if(samIn.GetStatus() != SamStatus::NO_MORE_RECS)
   {
      // Failed to read a record.
      fprintf(stderr, "%s\n", samIn.GetStatusMessage());
   }   
   
   std::cerr << std::endl << "Number of records read = " << 
     samIn.GetCurrentRecordCount() << std::endl;
   std::cerr << "Number of records written = " << 
     samOut.GetCurrentRecordCount() << std::endl;

   if(samIn.GetStatus() != SamStatus::NO_MORE_RECS)
   {
     // Failed reading a record.
     return(samIn.GetStatus());
   }

   // Since the reads were successful, return the status based
   samIn.Close();
   samOut.Close();
   return 0;
}
Ejemplo n.º 23
0
int GapInfo::processFile(const char* inputFileName, const char* outputFileName,
                         const char* refFile, bool detailed,
                         bool checkFirst, bool checkStrand)
{
    // Open the file for reading.
    SamFile samIn;
    samIn.OpenForRead(inputFileName);

    // Read the sam header.
    SamFileHeader samHeader;
    samIn.ReadHeader(samHeader);

    SamRecord samRecord;

    GenomeSequence* refPtr = NULL;
    if(strcmp(refFile, "") != 0)
    {
        refPtr = new GenomeSequence(refFile);
    }

    IFILE outFile = ifopen(outputFileName, "w");

    // Map for summary.
    std::map<int, int> gapInfoMap;


    // Keep reading records until ReadRecord returns false.
    while(samIn.ReadRecord(samHeader, samRecord))
    {
        uint16_t samFlags = samRecord.getFlag();

        if((!SamFlag::isMapped(samFlags)) || 
           (!SamFlag::isMateMapped(samFlags)) ||
           (!SamFlag::isPaired(samFlags)) ||
           (samFlags & SamFlag::SECONDARY_ALIGNMENT) || 
           (SamFlag::isDuplicate(samFlags)) ||
           (SamFlag::isQCFailure(samFlags)))
        {
            // unmapped, mate unmapped, not paired,
            // not the primary alignment,
            // duplicate, fails vendor quality check 
            continue;
        }

        // No gap info if the chromosome names are different or
        // are unknown.
        int32_t refID = samRecord.getReferenceID();
        if((refID != samRecord.getMateReferenceID()) || (refID == -1))
        {
            continue;
        }

        int32_t readStart = samRecord.get0BasedPosition();
        int32_t mateStart = samRecord.get0BasedMatePosition();

        // If the mate starts first, then the pair was processed by
        // the mate.
        if(mateStart < readStart)
        {
            continue;
        }
        if((mateStart == readStart) && (SamFlag::isReverse(samFlags)))
        {
            // read and mate start at the same position, so 
            // only process the forward strand.
            continue;
        }

        // Process this read pair.
        int32_t readEnd = samRecord.get0BasedAlignmentEnd();
        
        int32_t gapSize = mateStart - readEnd - 1;

        if(detailed)
        {
            // Output the gap info.
            ifprintf(outFile, "%s\t%d\t%d", 
                     samRecord.getReferenceName(), readEnd+1, gapSize);
            
            // Check if it is not the first or if it is not the forward strand.
            if(checkFirst && !SamFlag::isFirstFragment(samFlags))
            {
                ifprintf(outFile, "\tNotFirst");
            }
            if(checkStrand && SamFlag::isReverse(samFlags))
            {
                ifprintf(outFile, "\tReverse");
            }
            ifprintf(outFile, "\n");
        }
        else
        {
            // Summary.
            // Skip reads that are not the forward strand.
            if(SamFlag::isReverse(samFlags))
            {
                // continue
                continue;
            }

            // Forward.
            // Check the reference for 'N's.
            if(refPtr != NULL)
            {
                genomeIndex_t chromStartIndex = 
                    refPtr->getGenomePosition(samRecord.getReferenceName());
                if(chromStartIndex == INVALID_GENOME_INDEX)
                {
                    // Invalid position, so continue to the next one.
                    continue;
                }
                bool skipRead = false;
                for(int i = readEnd + 1; i < mateStart; i++)
                {
                    if((*refPtr)[i] == 'N')
                    {
                        // 'N' in the reference, so continue to the next read.
                        skipRead = true;
                        break;
                    }
                }
                if(skipRead)
                {
                    continue;
                }
            }
            
            // Update the gapInfo.
            gapInfoMap[gapSize]++;
        }
    }

    if(!detailed)
    {
        // Output the summary.
        ifprintf(outFile, "GapSize\tNumPairs\n");
        for(std::map<int,int>::iterator iter = gapInfoMap.begin(); 
            iter != gapInfoMap.end(); iter++)
        {
            ifprintf(outFile, "%d\t%d\n", (*iter).first, (*iter).second);
        }
    }
    

    SamStatus::Status returnStatus = samIn.GetStatus();
    if(returnStatus == SamStatus::NO_MORE_RECS)
    {
        return(SamStatus::SUCCESS);
    }
    return(returnStatus);
}
Ejemplo n.º 24
0
void Bam2FastQ::writeFastQ(SamRecord& samRec, IFILE filePtr,
                           const std::string& fileNameExt, const char* readNameExt)
{
    static int16_t flag;
    static std::string sequence;
    static String quality;
    static std::string rg;
    static std::string rgFastqExt;
    static std::string rgListStr;
    static std::string fileName;
    static std::string fq2;
    if(mySplitRG)
    {
        rg = samRec.getString("RG").c_str();
        rgFastqExt = rg + fileNameExt;

        OutFastqMap::iterator it;
        it = myOutFastqs.find(rgFastqExt);
        if(it == myOutFastqs.end())
        {
            // New file.
            fileName = myOutBase.c_str();
            if(rg != "")
            {
                fileName += '.';
            }
            else
            {
                rg = ".";
            }
            fileName += rgFastqExt;
            filePtr = ifopen(fileName.c_str(), "w", myCompression);
            myOutFastqs[rgFastqExt] = filePtr;

            if(fileNameExt != mySecondFileNameExt)
            {
                // first end.
                const char* sm = mySamHeader.getRGTagValue("SM", rg.c_str());
                if(strcmp(sm, "") == 0){sm = myOutBase.c_str();}

                rgListStr.clear();
                SamHeaderRG* rgPtr = mySamHeader.getRG(rg.c_str());
                if((rgPtr == NULL) || (!rgPtr->appendString(rgListStr)))
                {
                    // No RG info for this record.
                    rgListStr = ".\n";
                }
                fq2 = ".";
                if(fileNameExt == myFirstFileNameExt)
                {
                    fq2 = myOutBase.c_str();
                    if(rg != ".")
                    {
                        fq2 += '.';
                        fq2 += rg;
                    }
                    fq2 += mySecondFileNameExt;
                }
                ifprintf(myFqList, "%s\t%s\t%s\t%s",
                         sm, fileName.c_str(), fq2.c_str(),
                         rgListStr.c_str());
            }
        }
        else
        {
            filePtr = it->second;
        }
    }
    if(filePtr == NULL)
    {
        throw(std::runtime_error("Programming ERROR/EXITING: Bam2FastQ filePtr not set."));
        return;
    }

    flag = samRec.getFlag();
    const char* readName = samRec.getReadName();
    sequence = samRec.getSequence();
    if(myQField.IsEmpty())
    {
        // Read the quality from the quality field
        quality = samRec.getQuality();
    }
    else
    {
        // Read Quality from the specified tag
        const String* qTagPtr = samRec.getStringTag(myQField.c_str());
        if((qTagPtr != NULL) && (qTagPtr->Length() == (int)sequence.length()))
        {
            // Use the tag value for quality
            quality = qTagPtr->c_str();
        }
        else
        {
            // Tag was not found, so use the quality field.
            ++myNumQualTagErrors;
            if(myNumQualTagErrors == 1)
            {
                std::cerr << "Bam2FastQ: " << myQField.c_str() 
                          << " tag was not found/invalid, so using the quality field in records without the tag\n";
            }
            quality = samRec.getQuality();
        }
    }
    
    if(SamFlag::isReverse(flag) && myReverseComp)
    {
        // It is reverse, so reverse compliment the sequence
        BaseUtilities::reverseComplement(sequence);
        // Reverse the quality.
        quality.Reverse();
    }
    else
    {
        // Ensure it is all capitalized.
        int seqLen = sequence.size();
        for (int i = 0; i < seqLen; i++)
        {
            sequence[i] = (char)toupper(sequence[i]);
        }
    }
    
    if(myRNPlus)
    {

        ifprintf(filePtr, "@%s%s\n%s\n+%s%s\n%s\n", readName, readNameExt,
                 sequence.c_str(), readName, readNameExt, quality.c_str());
    }
    else
    {
        ifprintf(filePtr, "@%s%s\n%s\n+\n%s\n", readName, readNameExt,
                 sequence.c_str(), quality.c_str());
    }
    // Release the record.
    myPool.releaseRecord(&samRec);
}
Ejemplo n.º 25
0
void validateRead1ModQuality(SamRecord& samRecord)
{
    //////////////////////////////////////////
    // Validate Record 1
    // Create record structure for validating.
    int expectedBlockSize = 89;
    const char* expectedReferenceName = "1";
    const char* expectedMateReferenceName = "1";
    const char* expectedMateReferenceNameOrEqual = "=";

    bamRecordStruct* expectedRecordPtr =
        (bamRecordStruct *) malloc(expectedBlockSize + sizeof(int));

    char tag[3];
    char type;
    void* value;
    bamRecordStruct* bufferPtr;
    unsigned char* varPtr;

    expectedRecordPtr->myBlockSize = expectedBlockSize;
    expectedRecordPtr->myReferenceID = 0;
    expectedRecordPtr->myPosition = 1010;
    expectedRecordPtr->myReadNameLength = 23;
    expectedRecordPtr->myMapQuality = 0;
    expectedRecordPtr->myBin = 4681;
    expectedRecordPtr->myCigarLength = 2;
    expectedRecordPtr->myFlag = 73;
    expectedRecordPtr->myReadLength = 5;
    expectedRecordPtr->myMateReferenceID = 0;
    expectedRecordPtr->myMatePosition = 1010;
    expectedRecordPtr->myInsertSize = 0;
   
    // Check the alignment end
    assert(samRecord.get0BasedAlignmentEnd() == 1016);
    assert(samRecord.get1BasedAlignmentEnd() == 1017);
    assert(samRecord.getAlignmentLength() == 7);
    assert(samRecord.get0BasedUnclippedStart() == 1010);
    assert(samRecord.get1BasedUnclippedStart() == 1011);
    assert(samRecord.get0BasedUnclippedEnd() == 1016);
    assert(samRecord.get1BasedUnclippedEnd() == 1017);

    // Check the accessors.
    assert(samRecord.getBlockSize() == expectedRecordPtr->myBlockSize);
    assert(samRecord.getReferenceID() == expectedRecordPtr->myReferenceID);
    assert(strcmp(samRecord.getReferenceName(), expectedReferenceName) == 0);
    assert(samRecord.get1BasedPosition() == expectedRecordPtr->myPosition + 1);
    assert(samRecord.get0BasedPosition() == expectedRecordPtr->myPosition);
    assert(samRecord.getReadNameLength() == 
           expectedRecordPtr->myReadNameLength);
    assert(samRecord.getMapQuality() == expectedRecordPtr->myMapQuality);
    assert(samRecord.getBin() == expectedRecordPtr->myBin);
    assert(samRecord.getCigarLength() == expectedRecordPtr->myCigarLength);
    assert(samRecord.getFlag() == expectedRecordPtr->myFlag);
    assert(samRecord.getReadLength() == expectedRecordPtr->myReadLength);
    assert(samRecord.getMateReferenceID() ==
           expectedRecordPtr->myMateReferenceID);
    assert(strcmp(samRecord.getMateReferenceName(), 
                  expectedMateReferenceName) == 0);
    assert(strcmp(samRecord.getMateReferenceNameOrEqual(), 
                  expectedMateReferenceNameOrEqual) == 0);
    assert(samRecord.get1BasedMatePosition() == 
           expectedRecordPtr->myMatePosition + 1);
    assert(samRecord.get0BasedMatePosition() ==
           expectedRecordPtr->myMatePosition);
    assert(samRecord.getInsertSize() == expectedRecordPtr->myInsertSize);
    assert(strcmp(samRecord.getReadName(), "1:1011:F:255+17M15D20M") == 0);
    assert(strcmp(samRecord.getCigar(), "5M2D") == 0);
    assert(strcmp(samRecord.getSequence(), "CCGAA") == 0);
    assert(strcmp(samRecord.getQuality(), "ABCDE") == 0);
    assert(samRecord.getNumOverlaps(1010, 1017) == 5);
    assert(samRecord.getNumOverlaps(1010, 1016) == 5);
    assert(samRecord.getNumOverlaps(1012, 1017) == 3);
    assert(samRecord.getNumOverlaps(1015, 1017) == 0);
    assert(samRecord.getNumOverlaps(1017, 1010) == 0);
    assert(samRecord.getNumOverlaps(1013, 1011) == 0);
    assert(samRecord.getNumOverlaps(-1, 1017) == 5);

    // Reset the tag iter, since the tags have already been read.
    samRecord.resetTagIter();

    // Check the tags.
    assert(samRecord.getNextSamTag(tag, type, &value) == true);
    assert(tag[0] == 'A');
    assert(tag[1] == 'M');
    assert(type == 'i');
    assert(*(char*)value == 0);
    assert(samRecord.getNextSamTag(tag, type, &value) == true);
    assert(tag[0] == 'M');
    assert(tag[1] == 'D');
    assert(type == 'Z');
    assert(*(String*)value == "37");
    assert(samRecord.getNextSamTag(tag, type, &value) == true);
    assert(tag[0] == 'N');
    assert(tag[1] == 'M');
    assert(type == 'i');
    assert(*(char*)value == 0);
    assert(samRecord.getNextSamTag(tag, type, &value) == true);
    assert(tag[0] == 'X');
    assert(tag[1] == 'T');
    assert(type == 'A');
    assert(*(char*)value == 'R');
    // No more tags, should return false.
    assert(samRecord.getNextSamTag(tag, type, &value) == false);
    assert(samRecord.getNextSamTag(tag, type, &value) == false);

    // Get the record ptr.   
    bufferPtr = (bamRecordStruct*)samRecord.getRecordBuffer();
    // Validate the buffers match.
    assert(bufferPtr->myBlockSize == expectedRecordPtr->myBlockSize);
    assert(bufferPtr->myReferenceID == expectedRecordPtr->myReferenceID);
    assert(bufferPtr->myPosition == expectedRecordPtr->myPosition);
    assert(bufferPtr->myReadNameLength == expectedRecordPtr->myReadNameLength);
    assert(bufferPtr->myMapQuality == expectedRecordPtr->myMapQuality);
    assert(bufferPtr->myBin == expectedRecordPtr->myBin);
    assert(bufferPtr->myCigarLength == expectedRecordPtr->myCigarLength);
    assert(bufferPtr->myFlag == expectedRecordPtr->myFlag);
    assert(bufferPtr->myReadLength == expectedRecordPtr->myReadLength);
    assert(bufferPtr->myMateReferenceID ==
           expectedRecordPtr->myMateReferenceID);
    assert(bufferPtr->myMatePosition == expectedRecordPtr->myMatePosition);
    assert(bufferPtr->myInsertSize == expectedRecordPtr->myInsertSize);

    // Validate the variable length fields in the buffer.
    // Set the pointer to the start of the variable fields.
    varPtr = (unsigned char*)(&(bufferPtr->myData[0]));

    // Validate the readname.
    for(int i = 0; i < expectedRecordPtr->myReadNameLength; i++)
    {
        assert(*varPtr == samRecord.getReadName()[i]);
        varPtr++;
    }

    // Validate the cigar.
    // The First cigar is 5M which is 5 << 4 | 0 = 80
    assert(*(unsigned int*)varPtr == 80);
    // Increment the varptr the size of an int.
    varPtr += 4;
    // The 2nd cigar is 2D which is 2 << 4 | 2 = 34
    assert(*(unsigned int*)varPtr == 34);
    // Increment the varptr the size of an int.
    varPtr += 4;
   
    // Validate the sequence.
    // CC = 0x22
    assert(*varPtr == 0x22);
    varPtr++;
    // GA = 0x41
    assert(*varPtr == 0x41);
    varPtr++;
    // A  = 0x10
    assert(*varPtr == 0x10);
    varPtr++;
  
    // Validate the Quality
    for(int i = 0; i < expectedRecordPtr->myReadLength; i++)
    {
        assert(*varPtr == samRecord.getQuality()[i] - 33);
        varPtr++;
    }

    // Validate the tags.  
    assert(*varPtr == 'A');
    varPtr++;
    assert(*varPtr == 'M');
    varPtr++;
    assert(*varPtr == 'C');
    varPtr++;
    assert(*varPtr == 0);
    varPtr++;
    assert(*varPtr == 'M');
    varPtr++;
    assert(*varPtr == 'D');
    varPtr++;
    assert(*varPtr == 'Z');
    varPtr++;
    assert(*varPtr == '3');
    varPtr++;
    assert(*varPtr == '7');
    varPtr++;
    assert(*varPtr == 0);
    varPtr++;
    assert(*varPtr == 'N');
    varPtr++;
    assert(*varPtr == 'M');
    varPtr++;
    assert(*varPtr == 'C');
    varPtr++;
    assert(*varPtr == 0);
    varPtr++;
    assert(*varPtr == 'X');
    varPtr++;
    assert(*varPtr == 'T');
    varPtr++;
    assert(*varPtr == 'A');
    varPtr++;
    assert(*varPtr == 'R');
    varPtr++;
}
Ejemplo n.º 26
0
int Dedup_LowMem::execute(int argc, char** argv)
{
    /* --------------------------------
     * process the arguments
     * -------------------------------*/
    String inFile, outFile, logFile;
    myDoRecab = false;
    bool removeFlag = false;
    bool verboseFlag = false;
    myForceFlag = false;
    myNumMissingMate = 0;
    myMinQual = DEFAULT_MIN_QUAL;
    String excludeFlags = "0xB04";
    uint16_t intExcludeFlags = 0;
    bool noeof = false;
    bool params = false;

    LongParamContainer parameters;
    parameters.addGroup("Required Parameters");
    parameters.addString("in", &inFile);
    parameters.addString("out", &outFile);
    parameters.addGroup("Optional Parameters");
    parameters.addInt("minQual", & myMinQual);
    parameters.addString("log", &logFile);
    parameters.addBool("oneChrom", &myOneChrom);
    parameters.addBool("recab", &myDoRecab);
    parameters.addBool("rmDups", &removeFlag);
    parameters.addBool("force", &myForceFlag);
    parameters.addString("excludeFlags", &excludeFlags);
    parameters.addBool("verbose", &verboseFlag);
    parameters.addBool("noeof", &noeof);
    parameters.addBool("params", &params);
    parameters.addPhoneHome(VERSION);
    myRecab.addRecabSpecificParameters(parameters);

    ParameterList inputParameters;
    inputParameters.Add(new LongParameters ("Input Parameters",
                                            parameters.getLongParameterList()));

    // parameters start at index 2 rather than 1.
    inputParameters.Read(argc, argv, 2);

    // If no eof block is required for a bgzf file, set the bgzf file type to
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    if(inFile.IsEmpty())
    {
        printUsage(std::cerr);
        inputParameters.Status();
        std::cerr << "Specify an input file" << std::endl;
        return EXIT_FAILURE;
    }

    if(outFile.IsEmpty())
    {
        printUsage(std::cerr);
        inputParameters.Status();
        std::cerr << "Specify an output file" << std::endl;
        return EXIT_FAILURE;
    }

    intExcludeFlags = excludeFlags.AsInteger();

    if(myForceFlag && SamFlag::isDuplicate(intExcludeFlags))
    {
        printUsage(std::cerr);
        inputParameters.Status();
        std::cerr << "Cannot specify --force and Duplicate in the excludeFlags.  Since --force indicates to override"
                  << " previous duplicate setting and the excludeFlags says to skip those, you can't do both.\n";
        return EXIT_FAILURE;
    }

    if(!SamFlag::isSecondary(intExcludeFlags))
    {
        printUsage(std::cerr);
        inputParameters.Status();
        std::cerr << "ERROR: Secondary reads must be excluded, edit --excludeFlags to include 0x0100\n";
        return EXIT_FAILURE;
    }

    if(!(intExcludeFlags & SamFlag::SUPPLEMENTARY_ALIGNMENT))
    {
        printUsage(std::cerr);
        inputParameters.Status();
        std::cerr << "ERROR: Supplementary reads must be excluded, edit --excludeFlags to include 0x0800\n";
        return EXIT_FAILURE;
    }

    if(logFile.IsEmpty())
    {
        logFile = outFile + ".log";
    }

    if(myDoRecab)
    {
        int status = myRecab.processRecabParam();
        if(status != 0)
        {
            inputParameters.Status();
            return(status);
        }
    }

    if(params)
    {
        inputParameters.Status();
    }

    Logger::gLogger = new Logger(logFile.c_str(), verboseFlag);

    /* -------------------------------------------------------------------
     * The arguments are processed.  Prepare the input BAM file,
     * instantiate dedup_LowMem, and construct the read group library map
     * ------------------------------------------------------------------*/

    SamFile samIn;

    samIn.OpenForRead(inFile.c_str());
    // If the file isn't sorted it will throw an exception.
    samIn.setSortedValidation(SamFile::COORDINATE);

    SamFileHeader header;
    samIn.ReadHeader(header);

    buildReadGroupLibraryMap(header);

    lastReference = -1;
    lastCoordinate = -1;

    // for keeping some basic statistics
    uint32_t recordCount = 0;
    uint32_t pairedCount = 0;
    uint32_t properPairCount = 0;
    uint32_t unmappedCount = 0;
    uint32_t reverseCount = 0;
    uint32_t qualCheckFailCount = 0;
    uint32_t secondaryCount = 0;
    uint32_t supplementaryCount = 0;
    uint32_t excludedCount = 0;

    // Now we start reading records
    SamRecord* recordPtr;
    SamStatus::Status returnStatus = SamStatus::SUCCESS;
    while(returnStatus == SamStatus::SUCCESS)
    {
        recordPtr = mySamPool.getRecord();
        if(recordPtr == NULL)
        {
            std::cerr << "Failed to allocate enough records\n";
            return(-1);
        }
        if(!samIn.ReadRecord(header, *recordPtr))
        {
            returnStatus = samIn.GetStatus();
            continue;
        }
        // Take note of properties of this record
        int flag = recordPtr->getFlag();
        if(SamFlag::isPaired(flag))     ++pairedCount;
        if(SamFlag::isProperPair(flag)) ++properPairCount;
        if(SamFlag::isReverse(flag))    ++reverseCount;
        if(SamFlag::isQCFailure(flag))  ++qualCheckFailCount;
        if(SamFlag::isSecondary(flag))  ++secondaryCount;
        if(flag & SamFlag::SUPPLEMENTARY_ALIGNMENT)  ++supplementaryCount;
        if(!SamFlag::isMapped(flag))    ++unmappedCount;

        // put the record in the appropriate maps:
        //   single reads go in myFragmentMap
        //   paired reads go in myPairedMap
        recordCount = samIn.GetCurrentRecordCount();

        // if we have moved to a new position, look back at previous reads for duplicates
        if (hasPositionChanged(*recordPtr))
        {
            cleanupPriorReads(recordPtr);
        }

        // Determine if this read should be checked for duplicates.
        if((!SamFlag::isMapped(flag)) || ((flag & intExcludeFlags) != 0))
        {
            ++excludedCount;

            // No deduping done on this record, but still build the recab table.
            if(myDoRecab)
            {
                myRecab.processReadBuildTable(*recordPtr);
            }
            // Nothing more to do with this record, so
            // release the pointer.
            mySamPool.releaseRecord(recordPtr);
        }
        else
        {
            if(SamFlag::isDuplicate(flag) && !myForceFlag)
            {
                // Error: Marked duplicates, and duplicates aren't excluded.
                Logger::gLogger->error("There are records already duplicate marked.");
                Logger::gLogger->error("Use -f to clear the duplicate flag and start the dedup_LowMem procedure over");
            }

            checkDups(*recordPtr, recordCount);
            mySamPool.releaseRecord(recordPtr);
        }
        // let the user know we're not napping
        if (verboseFlag && (recordCount % 100000 == 0))
        {
            Logger::gLogger->writeLog("recordCount=%u singleKeyMap=%u pairedKeyMap=%u, dictSize=%u",
                                      recordCount, myFragmentMap.size(),
                                      myPairedMap.size(),
                                      myMateMap.size());
        }
    }

    // we're finished reading record so clean up the duplicate search and
    //  close the input file
    cleanupPriorReads(NULL);
    samIn.Close();

    // print some statistics
    Logger::gLogger->writeLog("--------------------------------------------------------------------------");
    Logger::gLogger->writeLog("SUMMARY STATISTICS OF THE READS");
    Logger::gLogger->writeLog("Total number of reads: %u",recordCount);
    Logger::gLogger->writeLog("Total number of paired-end reads: %u",
                              pairedCount);
    Logger::gLogger->writeLog("Total number of properly paired reads: %u",
                              properPairCount);
    Logger::gLogger->writeLog("Total number of unmapped reads: %u",
                              unmappedCount);
    Logger::gLogger->writeLog("Total number of reverse strand mapped reads: %u",
                              reverseCount);
    Logger::gLogger->writeLog("Total number of QC-failed reads: %u",
                              qualCheckFailCount);
    Logger::gLogger->writeLog("Total number of secondary reads: %u",
                              secondaryCount);
    Logger::gLogger->writeLog("Total number of supplementary reads: %u",
                              supplementaryCount);
    Logger::gLogger->writeLog("Size of singleKeyMap (must be zero): %u",
                              myFragmentMap.size());
    Logger::gLogger->writeLog("Size of pairedKeyMap (must be zero): %u",
                              myPairedMap.size());
    Logger::gLogger->writeLog("Total number of missing mates: %u",
                              myNumMissingMate);
    Logger::gLogger->writeLog("Total number of reads excluded from duplicate checking: %u",
                              excludedCount);
    Logger::gLogger->writeLog("--------------------------------------------------------------------------");
    Logger::gLogger->writeLog("Sorting the indices of %d duplicated records",
                              myDupList.size());

    // sort the indices of duplicate records
    std::sort(myDupList.begin(), myDupList.end(),
              std::less<uint32_t> ());

    // get ready to write the output file by making a second pass
    // through the input file
    samIn.OpenForRead(inFile.c_str());
    samIn.ReadHeader(header);

    SamFile samOut;
    samOut.OpenForWrite(outFile.c_str());
    samOut.WriteHeader(header);

    // If we are recalibrating, output the model information.
    if(myDoRecab)
    {
        myRecab.modelFitPrediction(outFile);
    }

    // an iterator to run through the duplicate indices
    int currentDupIndex = 0;
    bool moreDups = !myDupList.empty();

    // let the user know what we're doing
    Logger::gLogger->writeLog("\nWriting %s", outFile.c_str());

    // count the duplicate records as a check
    uint32_t singleDuplicates(0), pairedDuplicates(0);

    // start reading records and writing them out
    SamRecord record;
    while(samIn.ReadRecord(header, record))
    {
        uint32_t currentIndex = samIn.GetCurrentRecordCount();

        bool foundDup = moreDups &&
                        (currentIndex == myDupList[currentDupIndex]);

        // modify the duplicate flag and write out the record,
        // if it's appropriate
        int flag = record.getFlag();
        if (foundDup)
        {
            // this record is a duplicate, so mark it.
            record.setFlag( flag | 0x400 );
            currentDupIndex++;
            // increment duplicate counters to verify we found them all
            if ( ( ( flag & 0x0001 ) == 0 ) || ( flag & 0x0008 ) )
            {   // unpaired or mate unmapped
                singleDuplicates++;
            }
            else
            {
                pairedDuplicates++;
            }
            // recalibrate if necessary.
            if(myDoRecab)
            {
                myRecab.processReadApplyTable(record);
            }

            // write the record if we are not removing duplicates
            if (!removeFlag ) samOut.WriteRecord(header, record);
        }
        else
        {
            if(myForceFlag)
            {
                // this is not a duplicate we've identified but we want to
                // remove any duplicate marking
                record.setFlag( flag & 0xfffffbff ); // unmark duplicate
            }
            // Not a duplicate, so recalibrate if necessary.
            if(myDoRecab)
            {
                myRecab.processReadApplyTable(record);
            }
            samOut.WriteRecord(header, record);
        }

        // Let the user know we're still here
        if (verboseFlag && (currentIndex % 100000 == 0)) {
            Logger::gLogger->writeLog("recordCount=%u", currentIndex);
        }
    }

    // We're done.  Close the files and print triumphant messages.
    samIn.Close();
    samOut.Close();

    Logger::gLogger->writeLog("Successfully %s %u unpaired and %u paired duplicate reads",
                              removeFlag ? "removed" : "marked" ,
                              singleDuplicates,
                              pairedDuplicates/2);
    Logger::gLogger->writeLog("\nDedup_LowMem complete!");
    return 0;
}
Ejemplo n.º 27
0
bool BamProcessor::processRecord ()
{
    trclog << "\nProcessing record " << read_cnt_ << " - " << rec_.getReadName () << ", " << rec_.get0BasedUnclippedEnd () << "->" << rec_.getReadLength () << ", ref " << rec_.getReferenceName () << std::endl;
    const char* seq = rec_.getSequence ();
    unsigned position = rec_.get0BasedPosition ();
    unsigned new_position = position;
    bool reverse_match = (rec_.getFlag () & 0x10);

    Cigar* cigar_p = rec_.getCigarInfo ();
    if (!cigar_p->size ())  // can not recreate reference is cigar is missing. Keep record unaligned.
    {                       // TODO: allow to specify and load external reference
        ++ unaligned_cnt_;
        return true;
    }

    myassert (cigar_p);

    const String *mdval = rec_.getStringTag ("MD");
    if (!mdval) // can not recreate reference is MD tag is missing. Keep record as is.
    {
        warn << "No MD Tag for record " << proc_cnt_ << ". Skipping record." << std::endl;
        ++nomd_cnt_;
        return true; // record will be kept as-is.
    }
    std::string md_tag = mdval->c_str ();

    // find the non-clipped region
    uint32_t clean_len;
    EndClips clips;
    const char* clean_read = clip_seq (seq, *cigar_p, clean_len, clips);

    // find length needed for the reference
    // this reserves space enough for entire refference, including softclipped ends.
    unsigned ref_len = cigar_p->getExpectedReferenceBaseCount ();
    if (ref_buffer_sz_ < ref_len)
    {
        ref_buffer_sz_ = (1 + ref_len / REF_BUF_INCR) * REF_BUF_INCR;
        ref_buffer_.reset (ref_buffer_sz_);
    }
    if (clean_len > MAX_SEQ_LEN || ref_len > MAX_SEQ_LEN)
    {
        ++ toolongs_;
        return true;
    }

    // recreate reference by Query, Cigar, and MD tag. Do not include softclipped ends in the recreated sequence (use default last parameter)
    recreate_ref (seq, rec_.getReadLength (), cigar_p, md_tag.c_str (), ref_buffer_, ref_buffer_sz_);

    unsigned qry_ins; // extra bases in query     == width_left
    unsigned ref_ins; // extra bases in reference == width_right
    band_width (*cigar_p, qry_ins, ref_ins);

    if (log_matr_ || log_base_)
    {
        logfile_ << "Record " << read_cnt_ << ": " << rec_.getReadName () << "\n"
                 << "   sequence (" << rec_.getReadLength () << " bases)\n";
    }

    CigarRoller roller;
    int ref_shift = 0;  // shift of the new alignment position on refereance relative the original
    unsigned qry_off, ref_off; // offsets on the query and reference of the first non-clipped aligned bases
    double new_score = 0;

    switch (p_->algo ())
    {
        case ContalignParams::TEMPL:
        {
            // call aligner
            new_score = taligner_.eval (clean_read, clean_len, ref_buffer_, ref_len, 0, band_width_);
            // read traceback
            // TODO: convert directly to cigar
            genstr::Alignment* al = taligner_.trace ();
            // convert alignment to cigar
            ref_shift = roll_cigar (roller, *al, clean_len, clips, qry_off, ref_off);
        }
        break;
        case ContalignParams::PLAIN:
        {
            new_score = aligner_.align_band (
                clean_read,                     // xseq
                clean_len,                      // xlen
                ref_buffer_,                    // yseq
                ref_len,                        // ylen
                0,                              // xpos
                0,                              // ypos
                std::max (clean_len, ref_len),  // segment length
                qry_ins + band_width_,          // width_left
                false,                          // unpack
                ref_ins + band_width_,          // width_right - forces to width_left
                true,                           // to_beg
                true                            // to_end
                );
            unsigned bno = aligner_.backtrace (
                    batches_,      // BATCH buffer
                    max_batch_no_, // size of BATCH buffer
                    false,         // fill the BATCH array in reverse direction
                    ref_ins + band_width_ // width
                                    );
            // convert alignment to cigar
            ref_shift = roll_cigar (roller, batches_, bno, clean_len, clips, qry_off, ref_off);
        }
        break;
        case ContalignParams::POLY:
        {
            new_score = contalign_.align_band (
                clean_read,                     // xseq
                clean_len,                      // xlen
                ref_buffer_,                    // yseq
                ref_len,                        // ylen
                0,                              // xpos
                0,                              // ypos
                std::max (clean_len, ref_len),  // segment length
                qry_ins + band_width_,          // width_left
                false,                          // unpack
                ref_ins + band_width_,          // width_right - forces to width_left
                true,                           // to_beg
                true                            // to_end
                );
            unsigned bno = contalign_.backtrace (
                    batches_,      // BATCH buffer
                    max_batch_no_, // size of BATCH buffer
                    false,         // fill the BATCH array in reverse direction
                    ref_ins + band_width_ // width
                                    );
            // convert alignment to cigar
            ref_shift = roll_cigar (roller, batches_, bno, clean_len, clips, qry_off, ref_off);
        }
        break;
        default:
        break;
    }
    ++realigned_cnt_;
    // compare original and new cigar (and location)
    if (ref_shift || !(*cigar_p == roller))
    {
        // save original cigar and position for reporting
        std::string orig_cigar_str;
        rec_.getCigarInfo ()->getCigarString (orig_cigar_str);
        int32_t prior_pos = rec_.get0BasedPosition ();

        // replace cigar
        rec_.setCigar (roller);
        ++ modified_cnt_;
        // update pos_adjusted_cnt if position changed
        if (ref_shift != 0)
        {
            myassert (prior_pos + ref_shift >= 0);
            rec_.set0BasedPosition (prior_pos + ref_shift);
            ++ pos_adjusted_cnt_;
        }
        if (log_diff_)
        {
            const unsigned MAX_BATCH_PRINTED = 100;
            BATCH batches [MAX_BATCH_PRINTED];
            std::string new_cigar_str;
            unsigned bno;
            int swscore;

            rec_.getCigarInfo ()->getCigarString (new_cigar_str);
            if (!log_base_ && !log_matr_)
                logfile_ << "Record " << read_cnt_ << ": " << rec_.getReadName () << " (" << rec_.getReadLength () << " bases)\n";

            logfile_ << "   ORIG ALIGNMENT:" << std::right << std::setw (9) << prior_pos+1 << "->" <<  orig_cigar_str << "\n";
            bno = cigar_to_batches (orig_cigar_str, batches, MAX_BATCH_PRINTED);
            swscore = align_score (batches, bno, clean_read, ref_buffer_, p_->gip (), p_->gep (), p_->mat (), p_->mis ());
            print_batches (clean_read, clean_len, false, ref_buffer_, ref_len, false, batches, bno, logfile_, false, prior_pos + clips.soft_beg_, clips.soft_beg_, 0, 160);
            logfile_ << "\n     'classic' SW score is " << swscore << "\n";

            logfile_ << "   NEW ALIGNMENT:" << std::right << std::setw (9) << rec_.get1BasedPosition () << "->" <<  new_cigar_str << std::endl;
            bno = cigar_to_batches (new_cigar_str, batches, MAX_BATCH_PRINTED);
            swscore = align_score (batches, bno, clean_read + qry_off, ref_buffer_ + ref_off, p_->gip (), p_->gep (), p_->mat (), p_->mis ());
            print_batches (clean_read + qry_off, clean_len - qry_off, false, ref_buffer_ + ref_off, ref_len - ref_off, false, batches, bno, logfile_, false, prior_pos + clips.soft_beg_ + ref_off, clips.soft_beg_ + qry_off, 0, 160);
            logfile_ << "\n      'classic' SW score is " << swscore;
            logfile_ << "\n      alternate (context-aware) score is " << new_score << ", used bandwidth left: " << qry_ins + band_width_ << ", right: " << ref_ins + band_width_ << "\n" << std::endl;
        }
        else if (log_base_)
        {
            logfile_ << "Recomputed alignment differs from original:\n";
            logfile_ << "   ORIG ALIGNMENT:" << std::right << std::setw (9) << prior_pos+1 << "->" <<  orig_cigar_str << "\n";
            std::string new_cigar_str;
            rec_.getCigarInfo ()->getCigarString (new_cigar_str);
            logfile_ << "    NEW ALIGNMENT:" << std::right << std::setw (9) << rec_.get1BasedPosition () << "->" <<  new_cigar_str << "\n" << std::endl;
        }
    }
    else
    {
        if (log_base_)
        {
            logfile_ << "Recomputed alignment matches the original:\n";
            std::string orig_cigar_str;
            rec_.getCigarInfo ()->getCigarString (orig_cigar_str);
            int32_t prior_pos = rec_.get0BasedPosition ();
            logfile_ << "   " << std::right << std::setw (9) << prior_pos+1 << "->" <<  orig_cigar_str << "\n" << std::endl;
        }
    }
    return true;
}