//
// Store one alignment in a cmp.h5 file.
//
// Steps, in order:
//   1. Parse the movie name and hole number from alignment.qName and
//      register the movie (StoreMovieInfo).
//   2. Verify the target name is a known reference, then register its
//      reference group (StoreRefGroup) and the "/<refGroup>/<movie>" path
//      (StorePath).
//   3. Trim gaps at the alignment ends, convert the alignment to a byte
//      alignment and append it with cmpFile.StoreAlnArray.
//   4. If copyQVs is set, store every non-empty optional QV / tag track.
//   5. Recompute alignment statistics and write the 22-column AlnIndex row.
//
// alignment      : alignment to store; modified (gaps trimmed, statistics
//                  recomputed).
// alnSegment     : written to the "StrobeNumber" column.
// cmpFile        : destination cmp.h5 file object.
// moleculeNumber : written to the "MoleculeID" column; when -1, an id is
//                  derived from the movie id and the hole number.
// copyQVs        : whether to store the optional quality values and tags.
//
// Prints an error and calls exit(1) if the read name cannot be parsed, the
// reference is unknown, or the reference group id is not indexed.
//
void AlignmentSetToCmpH5Adapter<T_CmpFile>::StoreAlignmentCandidate(
    AlignmentCandidate<> &alignment, 
    int alnSegment,
    T_CmpFile &cmpFile,
    int moleculeNumber,
    bool copyQVs) {
  //
  // Find out where the movie is going to get stored.
  //
  std::string movieName;
  int holeNumber = 0;
  const bool nameParsedProperly =
      ParsePBIReadName(alignment.qName, movieName, holeNumber);
  if (!nameParsedProperly) {
    std::cout <<"ERROR. Attempting to store a read with name " 
          << alignment.qName << " that does not " << std::endl
          << "appear to be a PacBio read." << std::endl;
    exit(1);
  }

  unsigned int movieId = StoreMovieInfo(movieName, cmpFile);

  // Check whether the reference is in /RefInfo.
  if (refNameToRefInfoIndex.find(alignment.tName) == refNameToRefInfoIndex.end()) {
    std::cout << "ERROR. The reference name " << alignment.tName 
          << " was not found in the list of references." << std::endl;
    std::cout << "Perhaps a different reference file was aligned to than " << std::endl
          << "what was provided for SAM conversion. " << std::endl;
    exit(1);
  } 

  // Store refGroup
  unsigned int refGroupId = StoreRefGroup(alignment.tName, cmpFile);
  std::string refGroupName = refNameToRefGroupNameandId[alignment.tName].name; 
  assert(refGroupId  == refNameToRefGroupNameandId[alignment.tName].id);

  if (cmpFile.refGroupIdToArrayIndex.find(refGroupId) == cmpFile.refGroupIdToArrayIndex.end()) {
    std::cout << "ERROR. The reference ID is not indexed. " 
          << "This is an internal inconsistency." << std::endl;
    exit(1);
  }

  // Sanity check only: the array index is expected to be the id minus one.
  size_t refGroupIndex = cmpFile.refGroupIdToArrayIndex[refGroupId];
  assert(refGroupIndex + 1 == refGroupId);
  (void)refGroupIndex;  // Only read by the assert; avoid a warning under NDEBUG.

  std::string path = "/" + refGroupName + "/" + movieName;
  unsigned int pathId = StorePath(path, cmpFile);

  RemoveGapsAtEndOfAlignment(alignment);

  /*
    * Store the alignment string
    */
  std::vector<unsigned char> byteAlignment;
  AlignmentToByteAlignment(alignment, 
                            alignment.qAlignedSeq, alignment.tAlignedSeq,
                            byteAlignment);

  unsigned int offsetBegin, offsetEnd;
  cmpFile.StoreAlnArray(byteAlignment, alignment.tName, movieName, offsetBegin, offsetEnd);

  // Copy QVs into cmp.h5
  if (copyQVs) {
    std::vector<std::string> optionalQVs;
    alignment.CopyQVs(&optionalQVs);
    for (size_t qv_i = 0; qv_i < optionalQVs.size(); qv_i++) {
      std::string &qvName = alignment.optionalQVNames[qv_i];
      std::string &qvString = optionalQVs[qv_i];

      // If the qvString is empty, then the alignment is missing the quality
      // value
      if (qvString.empty()) {
        continue;
      }

      // A track is a tag when its name ends in "Tag". The length guard is
      // required: for a name shorter than three characters, size() - 3 wraps
      // around and std::string::compare throws std::out_of_range.
      const bool isTag = qvName.size() >= 3 &&
                         qvName.compare(qvName.size() - 3, 3, "Tag") == 0;

      unsigned int qvOffsetBegin, qvOffsetEnd;
      if (isTag) {
        std::vector<char> qvVector;
        QVsToCmpH5QVs(qvString, byteAlignment, true, &qvVector);
        cmpFile.StoreTags(qvVector, alignment.tName, qvName,
                          movieName, &qvOffsetBegin, &qvOffsetEnd);
      } else {
        std::vector<UChar> qvVector;
        QVsToCmpH5QVs(qvString, byteAlignment, false, &qvVector);
        cmpFile.StoreQVs(qvVector, alignment.tName, qvName,
                         movieName, &qvOffsetBegin, &qvOffsetEnd);
      }
      // Every QV track must occupy the same offsets as the alignment array.
      assert(qvOffsetBegin == offsetBegin);
      assert(qvOffsetEnd == offsetEnd);
    }
  }

  // Incremented before use: the new count is this alignment's AlnID.
  numAlignments++;

  DistanceMatrixScoreFunction<DNASequence, DNASequence> distScoreFn;
  //distScoreFn does not matter since the score is not stored.
  ComputeAlignmentStats(alignment, alignment.qAlignedSeq.seq, alignment.tAlignedSeq.seq, distScoreFn);

  /*
    The current AlnIndex column names:
    (0): "AlnID", "AlnGroupID", "MovieID", "RefGroupID", "tStart",
    (5): "tEnd", "RCRefStrand", "HoleNumber", "SetNumber",
    (9): "StrobeNumber", "MoleculeID", "rStart", "rEnd", "MapQV", "nM",
    (15): "nMM", "nIns", "nDel", "Offset_begin", "Offset_end",
    (20): "nBackRead", "nReadOverlap"
  */
  if (moleculeNumber == -1) {
    moleculeNumber =  numZMWsPerMovieSpringField * (movieId - 1) + holeNumber;
  }

  std::vector<unsigned int> alnIndex(22);
  alnIndex[0]  = numAlignments;  // AlnId
  alnIndex[1]  = pathId;        // AlnGroupID
  alnIndex[2]  = movieId;    // MovieID
  alnIndex[3]  = refGroupId; // RefGroupID
  alnIndex[4]  = alignment.tAlignedSeqPos; // tStart
  alnIndex[5]  = alignment.tAlignedSeqPos +  alignment.tAlignedSeqLength; // tEnd
  alnIndex[6]  = alignment.tStrand; // RCRefStrand
  alnIndex[7]  = holeNumber; // HoleNumber
  alnIndex[8]  = 0; // SET NUMBER -- parse later!!!!
  alnIndex[9]  = alnSegment; // strobenumber
  alnIndex[10] = moleculeNumber; // MoleculeID
  alnIndex[11] = alignment.qAlignedSeqPos; // rStart
  alnIndex[12] = alignment.qAlignedSeqPos + alignment.qAlignedSeqLength; // rEnd
  alnIndex[13] = alignment.mapQV; // MapQV
  alnIndex[14] = alignment.nMatch; // nM
  alnIndex[15] = alignment.nMismatch; // nMM
  alnIndex[16] = alignment.nIns; // nIns
  alnIndex[17] = alignment.nDel; // nDel
  alnIndex[18] = offsetBegin; // Offset_begin
  alnIndex[19] = offsetEnd; // Offset_end
  alnIndex[20] = 0; // nBackRead
  alnIndex[21] = 0; // nReadOverlap
  cmpFile.alnInfoGroup.WriteAlnIndex(alnIndex);
}
Exemple #2
0
//
// samFilter entry point: filter the hits of a SAM file.
//
// Reads a SAM file and the reference FASTA it was aligned against, drops
// records that are unmapped (rName "*"), outside the requested hole-number
// ranges, contain 'P' in their CIGAR, convert to zero or several alignment
// segments, map only to adapters (when an adapter GFF is given), or fail the
// filter criteria. The surviving records are grouped by query name, reduced
// by the hit policy, sorted by reference and query name, and written in SAM
// format to the output file (or to stdout when no output name is given).
//
// Returns 0 on success; calls exit(1) when the filter criteria are not sane,
// the hole-number ranges cannot be parsed, or a SMRT title cannot be parsed.
//
int main(int argc, char* argv[]) {
#ifdef USE_GOOGLE_PROFILER
    char *profileFileName = getenv("CPUPROFILE");
    if (profileFileName != NULL) {
      ProfilerStart(profileFileName);
    }
    else {
      ProfilerStart("google_profile.txt");
    }
#endif

    // Register inputs and outputs.
    string samFileName, refFileName, outFileName;

    CommandLineParser clp;
    clp.RegisterStringOption("file.sam", &samFileName,
                             "Input SAM file.");
    clp.RegisterStringOption("reference.fasta", &refFileName,
                             "Reference used to generate reads.");
    clp.RegisterStringOption("out.sam", &outFileName,
                             "Output SAM file.");
    clp.RegisterPreviousFlagsAsHidden();

    // Register filter criteria options.
    int minAlnLength = 50;
    float minPctSimilarity = 70, minPctAccuracy = 70;
    string hitPolicyStr = "randombest";
    bool useScoreCutoff = false;
    int  scoreCutoff = INF_INT;
    int  scoreSignInt = -1;
    RegisterFilterOptions(clp, minAlnLength, minPctSimilarity, 
                          minPctAccuracy, hitPolicyStr, useScoreCutoff,
                          scoreSignInt, scoreCutoff);

    int seed = 1; 
    clp.RegisterIntOption("seed", &seed,
            "(1)  Seed for random number generator.\n"
            "If seed is 0, then use current time as seed.",
            CommandLineParser::Integer);

    string holeNumberStr;
    Ranges holeNumberRanges;
    clp.RegisterStringOption("holeNumbers", &holeNumberStr,
            "A string of comma-delimited hole number ranges to output hits, "
            "such as '1,2,10-12'. "
            "This requires hit titles to be in SMRT read title format.");

    bool parseSmrtTitle = false;
    clp.RegisterFlagOption("smrtTitle", &parseSmrtTitle,
            "Use this option when filtering alignments generated by "
            "programs other than blasr, e.g. bwa-sw or gmap. "
            "  Parse read coordinates from the SMRT read title. " 
            "The title is in the format /name/hole/coordinates, where"
            " coordinates are in the format \\d+_\\d+, and represent "
            "the interval of the read that was aligned.");

    /* This experimental option can be useful for metagenomics, in which case
     * there are hundreds of sequences in the target, of which many titles are
     * long and may contain white spaces (e.g., ' ', '\t'). 
     * In order to save disk space and avoid the (possibly) non-unique mapping
     * between full and short reference names, one may call blasr with 
     * -titleTable option to represent all target sequences in the output
     * by their indices in the title table.*/
    string titleTableName = "";
    clp.RegisterStringOption("titleTable", &titleTableName,
            "Use this experimental option when filtering alignments generated by "
            "blasr with -titleTable titleTableName, in which case "
            "reference titles in SAM are represented by their "
            "indices (e.g., 0, 1, 2, ...) in the title table.");

    string adapterGffFileName = "";
    clp.RegisterStringOption("filterAdapterOnly", &adapterGffFileName,
            "Use this option to remove reads which can only map to adapters " 
            "specified in the GFF file.");

    bool verbose = false;
    clp.RegisterFlagOption("v", &verbose, "Be verbose.");

    clp.SetExamples(
            "Because SAM has optional tags that have different meanings"
            " in different programs, careful usage is required in order "
            "to have proper output.  The \"xs\" tag in bwa-sw is used to "
            "show the suboptimal score, but in PacBio SAM (blasr) it is "
            "defined as the start in the query sequence of the alignment.\n"
            "When \"-smrtTitle\" is specified, the xs tag is ignored, but "
            "when it is not specified, the coordinates given by the xs and "
            "xe tags are used to define the interval of a read that is "
            "aligned.  The CIGAR string is relative to this interval.");

    clp.ParseCommandLine(argc, argv);

    // Set random number seed. 
    if (seed == 0) {
        srand(time(NULL));
    } else {
        srand(seed);
    }
    
    // scoreSign is not declared in this function; it is defined elsewhere.
    scoreSign = (scoreSignInt == -1)?ScoreSign::NEGATIVE:ScoreSign::POSITIVE;
    Score s(static_cast<float>(scoreCutoff), scoreSign);
    FilterCriteria filterCriteria(minAlnLength, minPctSimilarity, 
                                  minPctAccuracy, true, s);
    filterCriteria.Verbose(verbose);
    HitPolicy hitPolicy(hitPolicyStr, scoreSign);
                                  
    string errMsg;
    if (not filterCriteria.MakeSane(errMsg)) {
        cout << errMsg << endl;
        exit(1);
    }

    // Parse hole number ranges. 
    if (holeNumberStr.size() != 0) {
        if (not holeNumberRanges.setRanges(holeNumberStr)) {
            cout << "Could not parse hole number ranges: "
                 << holeNumberStr << "." << endl;
            exit(1);
        } 
    }

    // Open output file. All output below goes through outFilePtr so that it
    // reaches stdout when no output file name was given.
    ostream * outFilePtr = &cout;
    ofstream outFileStrm;
    if (outFileName != "") {
        CrucialOpen(outFileName, outFileStrm, std::ios::out);
        outFilePtr = &outFileStrm;
    }
    
    GFFFile adapterGffFile;
    if (adapterGffFileName != "")
        adapterGffFile.ReadAll(adapterGffFileName);
    
    SAMReader<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> samReader;
    FASTAReader fastaReader;

    //
    // Initialize samReader and fastaReader.
    //
    samReader.Initialize(samFileName);
    fastaReader.Initialize(refFileName);

    //
    // Configure the file log.
    //
    string command;
    CommandLineParser::CommandLineToString(argc, argv, command);
    string log = "Filter sam hits.";
    string program = "samFilter";
    string versionString = VERSION;
    AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString);

    //
    // Read necessary input.
    //
    vector<FASTASequence> references;
    fastaReader.ReadAllSequences(references);

    // If the SAM file is generated by blasr with -titleTable,
    // then references in the SAM are represented by 
    // their corresponding indices in the title table.
    // In that case, we need to convert reference titles in fasta file
    // to their corresponding indices in the title table, such that
    // references in both SAM and fasta files are represented
    // by title table indices and therefore can match.
    if (titleTableName != "") {
        ConvertTitlesToTitleTableIndices(references, titleTableName);
    }
 
    AlignmentSet<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> alignmentSet;
    vector<string> allHeaders = samReader.ReadHeader(alignmentSet); 

    // Process SAM Header: append a @PG line for this program and echo all
    // header lines to the output.
    string commandLineString;
    clp.CommandLineToString(argc, argv, commandLineString);
    allHeaders.push_back("@PG\tID:SAMFILTER\tVN:" + versionString +
                         "\tCL:" + program + " " + commandLineString);
    for (size_t i = 0; i < allHeaders.size(); i++) {
        *outFilePtr << allHeaders[i] << endl;
    }

    //
    // The order of references in vector<FASTASequence> references and
    // AlignmentSet<, , >alignmentSet.references can be different.
    // Rearrange alignmentSet.references such that they are ordered in
    // exactly the same way as vector<FASTASequence> references.
    //
    alignmentSet.RearrangeReferences(references);

    // Map reference name obtained from SAM file to indices
    map<string, int> refNameToIndex;
    for (size_t i = 0; i < references.size(); i++) {
        string refName = alignmentSet.references[i].GetSequenceName();
        refNameToIndex[refName] = static_cast<int>(i);
    }

    //
    // Store the alignments.
    //
    SAMAlignment samAlignment;
    int alignIndex = 0; 

    //
    // For 150K, each chip produces about 300M sequences 
    // (not including quality values and etc.).
    // Let's assume that the sam file and reference data can 
    // fit in the memory. 
    // Need to scale for larger sequel data in the future.
    //
    vector<SAMAlignment> allSAMAlignments;
    while (samReader.GetNextAlignment(samAlignment)) {
        // Skip unmapped records.
        if (samAlignment.rName == "*") {
            continue;
        }

        if (parseSmrtTitle and holeNumberStr.size() != 0) {
            string movieName;
            int thisHoleNumber;
            if (not ParsePBIReadName(samAlignment.qName, 
                                     movieName, 
                                     thisHoleNumber)) {
                cout << "ERROR, could not parse SMRT title: "
                     << samAlignment.qName << "." << endl;
                exit(1);
            }
            if (not holeNumberRanges.contains(UInt(thisHoleNumber))) {
                if (verbose) 
                    cout << thisHoleNumber << " is not in range." << endl; 
                continue;
            }
        }

        if (samAlignment.cigar.find('P') != string::npos) {
            cout << "WARNING. Could not process SAM record with 'P' in "
                 << "its cigar string." << endl;
            continue;
        }

        vector<AlignmentCandidate<> > convertedAlignments;
        SAMAlignmentsToCandidates(samAlignment, 
                references, refNameToIndex,
                convertedAlignments, parseSmrtTitle, false);
        
        if (convertedAlignments.size() > 1) {
            cout << "WARNING. Ignore multiple segments." << endl;
            continue;
        }
        // Nothing to filter if the record produced no candidate; indexing
        // element 0 of an empty vector would be undefined behavior.
        if (convertedAlignments.empty()) {
            continue;
        }

        AlignmentCandidate<> & alignment = convertedAlignments[0];

        //score func does not matter
        DistanceMatrixScoreFunction<DNASequence, DNASequence> distFunc; 
        ComputeAlignmentStats(alignment, alignment.qAlignedSeq.seq, 
                              alignment.tAlignedSeq.seq, distFunc);

        bool keep = true;

        // Check whether this alignment can only map to adapters in 
        // the adapter GFF file.
        if (adapterGffFileName != "" and 
            CheckAdapterOnly(adapterGffFile, alignment, refNameToIndex)) {
            if (verbose)
                cout << alignment.qName << " filter adapter only."
                     << endl;
            keep = false;
        }

        if (keep) {
            // Assign score to samAlignment.
            samAlignment.score = samAlignment.as;

            if (not filterCriteria.Satisfy(static_cast<AlignmentCandidate<> *>(&alignment))) {
                keep = false;
            }
        }

        if (keep) {
            allSAMAlignments.push_back( samAlignment ); 
        }

        // Release the aligned subsequences on every path, not only for the
        // alignments that are kept.
        alignment.FreeSubsequences();

        if (keep) {
            ++alignIndex;
        }
    }

    // Sort all SAM alignments by qName, score and target position.
    sort(allSAMAlignments.begin(), allSAMAlignments.end(), 
         byQNameScoreTStart);

    unsigned int groupBegin = 0;
    unsigned int groupEnd = -1;
    vector<SAMAlignment> filteredSAMAlignments;
    while(groupBegin < allSAMAlignments.size()) {
        // Get the next group of SAM alignments which have the same qName
        // from allSAMAlignments[groupBegin ... groupEnd)
        GetNextSAMAlignmentGroup(allSAMAlignments, groupBegin, groupEnd);
        vector<unsigned int> hitIndices = ApplyHitPolicy(
                hitPolicy, allSAMAlignments, groupBegin, groupEnd);
        for(unsigned int i = 0; i < hitIndices.size(); i++) {
            filteredSAMAlignments.push_back(allSAMAlignments[hitIndices[i]]);
        }
        groupBegin = groupEnd;
    }

    // Sort all SAM alignments by reference name and query name
    sort(filteredSAMAlignments.begin(), filteredSAMAlignments.end(), 
         byRNameQName);

    for(unsigned int i = 0; i < filteredSAMAlignments.size(); i++) {
        filteredSAMAlignments[i].PrintSAMAlignment(*outFilePtr);
    }

    if (outFileName != "") {
        outFileStrm.close();
    }
#ifdef USE_GOOGLE_PROFILER
  ProfilerStop();
#endif
    return 0;
}