示例#1
0
//
// Assemble a single read in `smrtRead` that covers all of `subreads`.
//
// The output is re-allocated with a length equal to the largest
// SubreadEnd() found in `subreads`, filled with 'N', and then each
// subread's bases are copied in at that subread's SubreadStart()
// offset.  Positions not covered by any subread therefore stay 'N'.
//
// smrtRead  Output.  Any previous content is released with Free().
//           Its hole number and highQualityRegionScore are taken from
//           subreads[0]; its title is set to "<movie name>/<hole number>".
// subreads  Input subreads; must be non-empty (asserted).
//           NOTE(review): the code only reads metadata from subreads[0],
//           so presumably all subreads come from the same hole -- confirm
//           against callers.
//
void MakeVirtualRead(SMRTSequence & smrtRead,
                     const vector<SMRTSequence> & subreads)
{
    assert(subreads.size() > 0);

    // Seed the running minimum from the first subread.  Starting it at 0
    // (as the code previously did) pins the minimum to 0 no matter where
    // the subreads actually begin, so lowQualityPrefix was always 0.
    DNALength hqStart = DNALength(subreads[0].SubreadStart());
    DNALength hqEnd   = 0;
    // Iterate by const reference: a by-value loop variable copies a whole
    // SMRTSequence on every iteration.
    for (const auto & subread : subreads) {
        hqStart = min(DNALength(subread.SubreadStart()), hqStart);
        hqEnd   = max(DNALength(subread.SubreadEnd()),   hqEnd);
    }

    // Start from a buffer of hqEnd bases, all 'N'.
    smrtRead.Free();
    smrtRead.Allocate(hqEnd);
    memset(smrtRead.seq, 'N', sizeof(char) * hqEnd);

    smrtRead.lowQualityPrefix = hqStart;
    // NOTE(review): presumably 0 when Allocate() sets length to hqEnd --
    // confirm against SMRTSequence::Allocate.
    smrtRead.lowQualitySuffix = smrtRead.length - hqEnd;
    smrtRead.highQualityRegionScore = subreads[0].highQualityRegionScore;
    smrtRead.HoleNumber(subreads[0].HoleNumber());

    // Title is "<movie name>/<hole number>", both taken from the first subread.
    stringstream ss;
    ss << SMRTTitle(subreads[0].GetTitle()).MovieName() << "/" << subreads[0].HoleNumber();
    smrtRead.CopyTitle(ss.str());

    // Lay each subread's bases over the 'N' background at its own offset.
    for (const auto & subread : subreads) {
        memcpy(&smrtRead.seq[subread.SubreadStart()],
               &subread.seq[0], sizeof(char) * subread.length);
    }
}
//
// Convert one SAM record into alignment candidates and append them to
// `candidates`.
//
// The CIGAR string of `sam` is walked from left to right.  Clipping and
// skipped operations separate the record into one or more aligned
// segments; each segment that yields at least one block becomes one
// AlignmentCandidate<> holding copies of the aligned query and
// reference subsequences.
//
// sam                    The SAM record.  Fields read here: cigar, seq,
//                        pos, flag, qName, rName, mapQV, xs (and the
//                        optional QVs through CopyQVs when copyQVs is set).
// referenceSequences     Reference contigs, indexed by the values stored
//                        in refNameToRefListIndex.
// refNameToRefListIndex  Maps a reference name to its index in
//                        referenceSequences.  If sam.rName is not "*" and
//                        is missing from this map, an error is printed
//                        and the process exits with status 1.
// candidates             Output.  New candidates are appended.  When
//                        keepRefAsForward is false and the first element
//                        has tStrand == 1, the whole vector is reversed
//                        before returning.
// parseSmrtTitle         When true, sam.qName is parsed as a SMRT title
//                        and its start coordinate is used as the query
//                        offset; a title that does not parse prints an
//                        error and exits with status 1.  When false, a
//                        non-zero sam.xs supplies the offset (xs - 1).
// keepRefAsForward       When false and the record's flag marks it as
//                        reverse-complemented, the CIGAR operations, the
//                        query and the optional QVs are reversed, and
//                        the aligned reference is stored reverse
//                        complemented with tStrand = 1 / qStrand = 0.
// copyQVs                When true, optional QV strings are copied from
//                        the record into every candidate.
//
void SAMAlignmentsToCandidates(SAMAlignment &sam,
                               std::vector<FASTASequence> &referenceSequences,
                               std::map<std::string,int> & refNameToRefListIndex,
                               std::vector<AlignmentCandidate<> > &candidates, 
                               bool parseSmrtTitle,
                               bool keepRefAsForward,
                               bool copyQVs) {
  //
  // Split the CIGAR string into parallel vectors of lengths and
  // operations.
  //
  std::vector<int> lengths;
  std::vector<char> ops;
  sam.cigar.Vectorize(lengths, ops);

  //
  // For now just reference the query sequence: querySeq borrows the
  // buffer owned by sam.seq and must not free it.
  //
  DNASequence querySeq;
  querySeq.deleteOnExit = false;
  querySeq.seq = (Nucleotide*) sam.seq.c_str();
  querySeq.length = sam.seq.size();

  DNALength samTEnd = 0;
  // sam.pos is shifted down by one to get the offset used below.
  DNALength samTStart = sam.pos - 1;

  std::vector<std::string> optionalQVs;
  if (copyQVs) {
    sam.CopyQVs(&optionalQVs);
  }

  //
  // Nothing to convert without CIGAR operations.  Leave before the
  // reverse-complement step so that this exit never has an allocated
  // copy of the query to release.
  //
  if (ops.empty()) {
    return;
  }

  if (keepRefAsForward == false and IsReverseComplement(sam.flag)) {
    ReverseAlignmentOperations(lengths, ops);
    DNASequence rcQuerySeq;
    querySeq.CopyAsRC(rcQuerySeq);
    //
    // Zero out the query seq so that the string memory is not
    // deleted when querySeq is reassigned.
    //
    querySeq.seq = nullptr;
    querySeq.length = 0;
    querySeq = rcQuerySeq;
    rcQuerySeq.Free();
    samTEnd = GetAlignedReferenceLengthByCIGARSum(ops, lengths);

    // The optional QVs have to follow the query's new orientation.
    if (copyQVs) {
      for (auto & qv : optionalQVs) {
        std::reverse(qv.begin(), qv.end());
      }
    }
  }

  //
  // Cursors into the CIGAR vectors, the query and the reference.
  //
  int cigarPos = 0;
  int qPos = 0;
  int tPos = 0;

  DNALength queryPosOffset = 0;
  if (parseSmrtTitle) {
    //
    // The aligned sequence is really a subread of a full
    // sequence. The positions of the alignments start at 0, the
    // beginning of the query sequence, but in the sam file, they
    // may appear as subreads, and are offset from the start of the
    // subread.  By convention, the subread coordinates are embedded
    // in the title of the query, if it is a smrtTitle.
    // Two types of smrtTitle are supported:
    // movie/zmw/start_end
    // movie/zmw/start_end/start2_end2
    //
    SMRTTitle stitle = SMRTTitle(sam.qName);

    if (not stitle.isSMRTTitle) {
      std::cout << "ERROR. Could not parse title " << sam.qName << std::endl;
      exit(1);
    }
    queryPosOffset = stitle.start;
  }
  else if (sam.xs) {
    queryPosOffset += sam.xs - 1;
  }

  while (static_cast<std::size_t>(cigarPos) < lengths.size()) {
    //
    // Sequence clipping becomes offsets into the q/t alignedSeqPos.
    // Only the soft-clipped count is needed here; the call also
    // advances cigarPos.
    //
    int numSoftClipped = 0;
    AdvancePastClipping(lengths, ops, cigarPos, numSoftClipped);

    // The record ended in clipping: no further segment.
    if (static_cast<std::size_t>(cigarPos) >= lengths.size()) {
      break;
    }
    qPos += numSoftClipped;

    //
    // Skipped sequences are just advances in the tPos.
    //
    const int numSkipped = AdvancePastSkipped(lengths, ops, cigarPos);
    tPos += numSkipped;

    if (static_cast<std::size_t>(cigarPos) >= lengths.size()) {
      break;
    }

    AlignmentCandidate<> alignment;
    //
    // The aligned sequence must start at a match therefore the tpos
    // and qpos are 0.
    //
    alignment.qPos = 0;
    alignment.tPos = 0;

    // qAlignStart is the start of the alignment relative to the sequence in the SAM file.
    const DNALength qAlignStart = qPos;
    // tAlignStart is the start of the alignment in the genome.
    const DNALength tAlignStart = tPos;

    // Find where this segment ends, then turn its operations into
    // blocks.  qPos and tPos are passed to CIGAROpsToBlocks and are
    // read back below to obtain the aligned lengths.
    int cigarEnd = cigarPos;
    AdvancePosToAlignmentEnd(ops, cigarEnd);

    CIGAROpsToBlocks(lengths, ops,
                     cigarPos, cigarEnd,
                     qPos, tPos,
                     alignment);

    alignment.qAlignedSeqLength = qPos - qAlignStart;
    alignment.tAlignedSeqLength = tPos - tAlignStart;

    //
    // Assign candidate sequences.
    //
    // First, the query sequence is straight from the SAM line.
    ((DNASequence*)&alignment.qAlignedSeq)->Copy(querySeq, qAlignStart, alignment.qAlignedSeqLength);
    if (copyQVs) {
      alignment.ReadOptionalQVs(optionalQVs, qAlignStart, alignment.qAlignedSeqLength);
    }

    // Strand of the query comes from the SAM flag; the reference
    // starts out as forward.
    alignment.qStrand = IsReverseComplement(sam.flag);
    alignment.tStrand = 0;
    alignment.mapQV   = sam.mapQV;

    //
    // Assign the offsets into the original sequence where the
    // subsequence starts.
    //
    alignment.qAlignedSeqPos = queryPosOffset + qAlignStart;
    alignment.tAlignedSeqPos = samTStart + tAlignStart;

    if (sam.rName == "*") {
      //
      // No reference, do not add the alignment to the list of
      // candidates.
      //
      continue;
    }

    // One lookup serves both the existence check and the index fetch.
    auto refEntry = refNameToRefListIndex.find(sam.rName);
    if (refEntry == refNameToRefListIndex.end()) {
      std::cout <<" ERROR.  SAM Reference " << sam.rName << " is not found in the list of reference contigs." << std::endl;
      exit(1);
    }
    const int refIndex = refEntry->second;

    alignment.tLength = referenceSequences[refIndex].length;
    alignment.qLength = sam.seq.size();
    alignment.qName = sam.qName;
    alignment.tName = sam.rName;

    //
    // When the read is kept as forward and the hit is on the reverse
    // strand, the reference window is measured from the other end of
    // the aligned reference span.
    //
    const bool reverseTarget = (keepRefAsForward == false and alignment.qStrand == 1);
    if (reverseTarget) {
      alignment.tAlignedSeqPos = samTStart + (samTEnd - tAlignStart - alignment.tAlignedSeqLength);
    }

    //
    // Reject windows that do not fit in the reference.  This check is
    // the same for both orientations.
    //
    if (alignment.tAlignedSeqLength > referenceSequences[refIndex].length ||
        alignment.tAlignedSeqPos    > referenceSequences[refIndex].length ||
        alignment.tAlignedSeqLength + alignment.tAlignedSeqPos > referenceSequences[refIndex].length + 2) {
      //alignment.tAlignedSeqPos is 1 based and unsigned.
      std::cout << "WARNING. The mapping of read " << alignment.qName
                << " to reference "      << alignment.tName
                << " is out of bounds."  << std::endl
                << "         StartPos (" << alignment.tAlignedSeqPos
                << ") + AlnLength (" << alignment.tAlignedSeqLength
                << ") > RefLength (" << referenceSequences[refIndex].length
                << ") + 2 "          << std::endl;
      continue;
    }

    ((DNASequence*)&alignment.tAlignedSeq)->Copy(referenceSequences[refIndex],
                                                 alignment.tAlignedSeqPos,
                                                 alignment.tAlignedSeqLength);

    if (reverseTarget) {
      //
      // Now that the reference sequence has been copied, make the
      // reverse complement for proper printing.
      //
      alignment.tAlignedSeq.ReverseComplementSelf();
      //
      // Either ref or read is defined as being in the forward
      // orientation.  Here, since refAsForward is false, the read
      // is forward.  Since the read is forward, the aligned
      // sequences are stored as the reverse complement of the read
      // and the references.
      //
      alignment.tStrand = 1;
      alignment.qStrand = 0;
    }

    if (not alignment.blocks.empty()) {
      candidates.push_back(alignment);
    }
  }

  if (not candidates.empty() and keepRefAsForward == false and candidates[0].tStrand == 1) {
    std::reverse(candidates.begin(), candidates.end());
  }
  querySeq.Free();
}