void AlignmentSetToCmpH5Adapter<T_CmpFile>::StoreAlignmentCandidate(
    AlignmentCandidate<> &alignment,
    int alnSegment,
    T_CmpFile &cmpFile,
    int moleculeNumber,
    bool copyQVs) {
  //
  // Store a single alignment into cmpFile:
  //   1. register the movie, reference group and alignment-group path,
  //   2. write the byte-encoded alignment array (and, when copyQVs is set,
  //      the optional quality values / tags),
  //   3. append one 22-column row to the AlnIndex.
  //
  // alignment      - the alignment to store; trailing gaps are removed and
  //                  its statistics are recomputed in place.
  // alnSegment     - written to the "StrobeNumber" AlnIndex column.
  // cmpFile        - destination file object.
  // moleculeNumber - written to the "MoleculeID" column; when -1 it is
  //                  derived from the movie id and the hole number.
  // copyQVs        - when true, optional QVs of the alignment are stored.
  //
  // The process is terminated with exit(1) when the read name cannot be
  // parsed, the reference name is unknown, or the reference group id is
  // not indexed.
  //

  //
  // Find out where the movie is going to get stored.
  //
  std::string movieName;
  int holeNumber = 0;
  if (!ParsePBIReadName(alignment.qName, movieName, holeNumber)) {
    std::cout <<"ERROR. Attempting to store a read with name "
              << alignment.qName << " that does not " << std::endl
              << "appear to be a PacBio read." << std::endl;
    exit(1);
  }

  unsigned int movieId = StoreMovieInfo(movieName, cmpFile);

  // Check whether the reference is in /RefInfo.
  if (refNameToRefInfoIndex.find(alignment.tName) ==
      refNameToRefInfoIndex.end()) {
    std::cout << "ERROR. The reference name " << alignment.tName
              << " was not found in the list of references." << std::endl;
    std::cout << "Perhaps a different reference file was aligned to than "
              << std::endl
              << "what was provided for SAM conversion. " << std::endl;
    exit(1);
  }

  // Store refGroup
  unsigned int refGroupId = StoreRefGroup(alignment.tName, cmpFile);
  std::string refGroupName = refNameToRefGroupNameandId[alignment.tName].name;
  assert(refGroupId == refNameToRefGroupNameandId[alignment.tName].id);

  if (cmpFile.refGroupIdToArrayIndex.find(refGroupId) ==
      cmpFile.refGroupIdToArrayIndex.end()) {
    std::cout << "ERROR. The reference ID is not indexed. "
              << "This is an internal inconsistency." << std::endl;
    exit(1);
  }

  size_t refGroupIndex = cmpFile.refGroupIdToArrayIndex[refGroupId];
  assert(refGroupIndex + 1 == refGroupId);

  std::string path = "/" + refGroupName + "/" + movieName;
  unsigned int pathId = StorePath(path, cmpFile);

  // One slot per AlnIndex column; all slots start at zero.
  std::vector<unsigned int> alnIndex(22);

  RemoveGapsAtEndOfAlignment(alignment);

  /*
   * Store the alignment string
   */
  std::vector<unsigned char> byteAlignment;
  AlignmentToByteAlignment(alignment,
                           alignment.qAlignedSeq, alignment.tAlignedSeq,
                           byteAlignment);

  unsigned int offsetBegin, offsetEnd;
  cmpFile.StoreAlnArray(byteAlignment, alignment.tName, movieName,
                        offsetBegin, offsetEnd);

  // Copy QVs into cmp.h5
  if (copyQVs) {
    std::vector<std::string> optionalQVs;
    alignment.CopyQVs(&optionalQVs);

    for (size_t qv_i = 0; qv_i < optionalQVs.size(); qv_i++) {
      std::string &qvName = alignment.optionalQVNames[qv_i];
      std::string &qvString = optionalQVs[qv_i];

      // If the qvString is empty, then the alignment is missing the quality
      // value
      if (qvString.empty()) {
        continue;
      }

      // A name ending in "Tag" is stored as a tag, everything else as a QV.
      // The length guard keeps compare() from being called with a wrapped
      // (huge) position for names shorter than three characters, which
      // would throw std::out_of_range.
      const bool isTag =
          qvName.size() >= 3 &&
          qvName.compare(qvName.size() - 3, 3, "Tag") == 0;

      unsigned int qvOffsetBegin, qvOffsetEnd;
      if (isTag) {
        std::vector<char> qvVector;
        QVsToCmpH5QVs(qvString, byteAlignment, true, &qvVector);
        cmpFile.StoreTags(qvVector, alignment.tName, qvName,
                          movieName, &qvOffsetBegin, &qvOffsetEnd);
      } else {
        std::vector<UChar> qvVector;
        QVsToCmpH5QVs(qvString, byteAlignment, false, &qvVector);
        cmpFile.StoreQVs(qvVector, alignment.tName, qvName,
                         movieName, &qvOffsetBegin, &qvOffsetEnd);
      }
      // QV arrays are expected to occupy the same offsets as the
      // alignment array.
      assert(qvOffsetBegin == offsetBegin);
      assert(qvOffsetEnd == offsetEnd);
    }
  }

  numAlignments++;

  //distScoreFn does not matter since the score is not stored.
  DistanceMatrixScoreFunction<DNASequence, DNASequence> distScoreFn;
  ComputeAlignmentStats(alignment, alignment.qAlignedSeq.seq,
                        alignment.tAlignedSeq.seq, distScoreFn);

  /*
    The current AlnIndex column names:
    (0): "AlnID", "AlnGroupID", "MovieID", "RefGroupID", "tStart",
    (5): "tEnd", "RCRefStrand", "HoleNumber", "SetNumber",
    (9): "StrobeNumber", "MoleculeID", "rStart", "rEnd", "MapQV", "nM",
    (15): "nMM", "nIns", "nDel", "Offset_begin", "Offset_end",
    (20): "nBackRead", "nReadOverlap"
  */
  if (moleculeNumber == -1) {
    moleculeNumber = numZMWsPerMovieSpringField * (movieId - 1) + holeNumber;
  }

  alnIndex[0]  = numAlignments;            // AlnId
  alnIndex[1]  = pathId;                   // AlnGroupID
  alnIndex[2]  = movieId;                  // MovieID
  alnIndex[3]  = refGroupId;               // RefGroupID
  alnIndex[4]  = alignment.tAlignedSeqPos; // tStart
  alnIndex[5]  = alignment.tAlignedSeqPos +
                 alignment.tAlignedSeqLength; // tEnd
  alnIndex[6]  = alignment.tStrand;        // RCRefStrand
  alnIndex[7]  = holeNumber;               // HoleNumber
  alnIndex[8]  = 0;                        // SET NUMBER -- parse later!!!!
  alnIndex[9]  = alnSegment;               // strobenumber
  alnIndex[10] = moleculeNumber;           // MoleculeID
  alnIndex[11] = alignment.qAlignedSeqPos; // rStart
  alnIndex[12] = alignment.qAlignedSeqPos +
                 alignment.qAlignedSeqLength; // rEnd
  alnIndex[13] = alignment.mapQV;          // MapQV
  alnIndex[14] = alignment.nMatch;         // nM
  alnIndex[15] = alignment.nMismatch;      // nMM
  alnIndex[16] = alignment.nIns;           // nIns
  alnIndex[17] = alignment.nDel;           // nDel
  alnIndex[18] = offsetBegin;              // Offset_begin
  alnIndex[19] = offsetEnd;                // Offset_end
  alnIndex[20] = 0;                        // nBackRead
  alnIndex[21] = 0;                        // nReadOverlap

  cmpFile.alnInfoGroup.WriteAlnIndex(alnIndex);
}
int main(int argc, char* argv[]) { #ifdef USE_GOOGLE_PROFILER char *profileFileName = getenv("CPUPROFILE"); if (profileFileName != NULL) { ProfilerStart(profileFileName); } else { ProfilerStart("google_profile.txt"); } #endif // Register inputs and outputs. string samFileName, refFileName, outFileName; CommandLineParser clp; clp.RegisterStringOption("file.sam", &samFileName, "Input SAM file."); clp.RegisterStringOption("reference.fasta", &refFileName, "Reference used to generate reads."); clp.RegisterStringOption("out.sam", &outFileName, "Output SAM file."); clp.RegisterPreviousFlagsAsHidden(); // Register filter criteria options. int minAlnLength = 50; float minPctSimilarity = 70, minPctAccuracy = 70; string hitPolicyStr = "randombest"; bool useScoreCutoff = false; int scoreCutoff = INF_INT; int scoreSignInt = -1; RegisterFilterOptions(clp, minAlnLength, minPctSimilarity, minPctAccuracy, hitPolicyStr, useScoreCutoff, scoreSignInt, scoreCutoff); int seed = 1; clp.RegisterIntOption("seed", &seed, "(1) Seed for random number generator.\n" "If seed is 0, then use current time as seed.", CommandLineParser::Integer); string holeNumberStr; Ranges holeNumberRanges; clp.RegisterStringOption("holeNumbers", &holeNumberStr, "A string of comma-delimited hole number ranges to output hits, " "such as '1,2,10-12'. " "This requires hit titles to be in SMRT read title format."); bool parseSmrtTitle = false; clp.RegisterFlagOption("smrtTitle", &parseSmrtTitle, "Use this option when filtering alignments generated by " "programs other than blasr, e.g. bwa-sw or gmap. " " Parse read coordinates from the SMRT read title. " "The title is in the format /name/hole/coordinates, where" " coordinates are in the format \\d+_\\d+, and represent " "the interval of the read that was aligned."); /* This experimental option can be useful for metagenomics, in which case * there are hundreds of sequences in the target, of which many titles are * long and may contain white spaces (e.g., ' ', '\t'). 
* In order to save disc space and avoid the (possibly) none unique mapping * between full and short reference names, one may call blasr with * -titleTable option to represent all target sequences in the output * by their indices in the title table.*/ string titleTableName = ""; clp.RegisterStringOption("titleTable", &titleTableName, "Use this experimental option when filtering alignments generated by " "blasr with -titleTable titleTableName, in which case " "reference titles in SAM are represented by their " "indices (e.g., 0, 1, 2, ...) in the title table."); string adapterGffFileName = ""; clp.RegisterStringOption("filterAdapterOnly", &adapterGffFileName, "Use this option to remove reads which can only map to adapters " "specified in the GFF file."); bool verbose = false; clp.RegisterFlagOption("v", &verbose, "Be verbose."); clp.SetExamples( "Because SAM has optional tags that have different meanings" " in different programs, careful usage is required in order " "to have proper output. The \"xs\" tag in bwa-sw is used to " "show the suboptimal score, but in PacBio SAM (blasr) it is " "defined as the start in the query sequence of the alignment.\n" "When \"-smrtTitle\" is specified, the xs tag is ignored, but " "when it is not specified, the coordinates given by the xs and " "xe tags are used to define the interval of a read that is " "aligned. The CIGAR string is relative to this interval."); clp.ParseCommandLine(argc, argv); // Set random number seed. if (seed == 0) { srand(time(NULL)); } else { srand(seed); } scoreSign = (scoreSignInt == -1)?ScoreSign::NEGATIVE:ScoreSign::POSITIVE; Score s(static_cast<float>(scoreCutoff), scoreSign); FilterCriteria filterCriteria(minAlnLength, minPctSimilarity, minPctAccuracy, true, s); filterCriteria.Verbose(verbose); HitPolicy hitPolicy(hitPolicyStr, scoreSign); string errMsg; if (not filterCriteria.MakeSane(errMsg)) { cout << errMsg << endl; exit(1); } // Parse hole number ranges. 
if (holeNumberStr.size() != 0) { if (not holeNumberRanges.setRanges(holeNumberStr)) { cout << "Could not parse hole number ranges: " << holeNumberStr << "." << endl; exit(1); } } // Open output file. ostream * outFilePtr = &cout; ofstream outFileStrm; if (outFileName != "") { CrucialOpen(outFileName, outFileStrm, std::ios::out); outFilePtr = &outFileStrm; } GFFFile adapterGffFile; if (adapterGffFileName != "") adapterGffFile.ReadAll(adapterGffFileName); SAMReader<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> samReader; FASTAReader fastaReader; // // Initialize samReader and fastaReader. // samReader.Initialize(samFileName); fastaReader.Initialize(refFileName); // // Configure the file log. // string command; CommandLineParser::CommandLineToString(argc, argv, command); string log = "Filter sam hits."; string program = "samFilter"; string versionString = VERSION; AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString); // // Read necessary input. // vector<FASTASequence> references; fastaReader.ReadAllSequences(references); // If the SAM file is generated by blasr with -titleTable, // then references in the SAM are represented by // their corresponding indices in the title table. // In that case, we need to convert reference titles in fasta file // to their corresponding indices in the title table, such that // references in both SAM and fasta files are represented // by title table indices and therefore can match. if (titleTableName != "") { ConvertTitlesToTitleTableIndices(references, titleTableName); } AlignmentSet<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> alignmentSet; vector<string> allHeaders = samReader.ReadHeader(alignmentSet); // Process SAM Header. 
string commandLineString; clp.CommandLineToString(argc, argv, commandLineString); allHeaders.push_back("@PG\tID:SAMFILTER\tVN:" + versionString + \ "\tCL:" + program + " " + commandLineString); for (int i = 0; i < allHeaders.size(); i++) { outFileStrm << allHeaders[i] << endl; } // // The order of references in vector<FASTASequence> references and // AlignmentSet<, , >alignmentSet.references can be different. // Rearrange alignmentSet.references such that they are ordered in // exactly the same way as vector<FASTASequence> references. // alignmentSet.RearrangeReferences(references); // Map reference name obtained from SAM file to indices map<string, int> refNameToIndex; for (int i = 0; i < references.size(); i++) { string refName = alignmentSet.references[i].GetSequenceName(); refNameToIndex[refName] = i; } // // Store the alignments. // SAMAlignment samAlignment; int alignIndex = 0; // // For 150K, each chip produces about 300M sequences // (not including quality values and etc.). // Let's assume that the sam file and reference data can // fit in the memory. // Need to scale for larger sequal data in the future. // vector<SAMAlignment> allSAMAlignments; while (samReader.GetNextAlignment(samAlignment)) { if (samAlignment.rName == "*") { continue; } if (parseSmrtTitle and holeNumberStr.size() != 0) { string movieName; int thisHoleNumber; if (not ParsePBIReadName(samAlignment.qName, movieName, thisHoleNumber)) { cout << "ERROR, could not parse SMRT title: " << samAlignment.qName << "." << endl; exit(1); } if (not holeNumberRanges.contains(UInt(thisHoleNumber))) { if (verbose) cout << thisHoleNumber << " is not in range." << endl; continue; } } if (samAlignment.cigar.find('P') != string::npos) { cout << "WARNING. Could not process SAM record with 'P' in " << "its cigar string." 
<< endl; continue; } vector<AlignmentCandidate<> > convertedAlignments; SAMAlignmentsToCandidates(samAlignment, references, refNameToIndex, convertedAlignments, parseSmrtTitle, false); if (convertedAlignments.size() > 1) { cout << "WARNING. Ignore multiple segments." << endl; continue; } for (int i = 0; i < 1; i++) { AlignmentCandidate<> & alignment = convertedAlignments[i]; //score func does not matter DistanceMatrixScoreFunction<DNASequence, DNASequence> distFunc; ComputeAlignmentStats(alignment, alignment.qAlignedSeq.seq, alignment.tAlignedSeq.seq, distFunc); // Check whether this alignment can only map to adapters in // the adapter GFF file. if (adapterGffFileName != "" and CheckAdapterOnly(adapterGffFile, alignment, refNameToIndex)) { if (verbose) cout << alignment.qName << " filter adapter only." << endl; continue; } // Assign score to samAlignment. samAlignment.score = samAlignment.as; if (not filterCriteria.Satisfy(static_cast<AlignmentCandidate<> *>(&alignment))) { continue; } allSAMAlignments.push_back( samAlignment ); alignment.FreeSubsequences(); } ++alignIndex; } // Sort all SAM alignments by qName, score and target position. sort(allSAMAlignments.begin(), allSAMAlignments.end(), byQNameScoreTStart); unsigned int groupBegin = 0; unsigned int groupEnd = -1; vector<SAMAlignment> filteredSAMAlignments; while(groupBegin < allSAMAlignments.size()) { // Get the next group of SAM alignments which have the same qName // from allSAMAlignments[groupBegin ... 
groupEnd) GetNextSAMAlignmentGroup(allSAMAlignments, groupBegin, groupEnd); vector<unsigned int> hitIndices = ApplyHitPolicy( hitPolicy, allSAMAlignments, groupBegin, groupEnd); for(unsigned int i = 0; i < hitIndices.size(); i++) { filteredSAMAlignments.push_back(allSAMAlignments[hitIndices[i]]); } groupBegin = groupEnd; } // Sort all SAM alignments by reference name and query name sort(filteredSAMAlignments.begin(), filteredSAMAlignments.end(), byRNameQName); for(unsigned int i = 0; i < filteredSAMAlignments.size(); i++) { filteredSAMAlignments[i].PrintSAMAlignment(outFileStrm); } if (outFileName != "") { outFileStrm.close(); } #ifdef USE_GOOGLE_PROFILER ProfilerStop(); #endif return 0; }