//
// Factory for SAMReader.  Asks the supplier for a DataReader (passing maxLineLen),
// builds a SAMReader on top of it with the given context, and initializes the
// reader for the requested portion of fileName.  The reader is allocated with
// new and handed back to the caller.
//
    SAMReader *
SAMReader::create(
    DataSupplier* supplier,
    const char *fileName,
    const ReaderContext& context,
    _int64 startingOffset,
    _int64 amountOfFileToProcess)
{
    DataReader* lineData = supplier->getDataReader(maxLineLen);
    SAMReader *result = new SAMReader(lineData, context);

    result->init(fileName, startingOffset, amountOfFileToProcess);

    return result;
}
void WorkerThreadMain(void *param) { ThreadContext *context = (ThreadContext *)param; _int64 rangeStart, rangeLength; SAMReader *samReader = NULL; ReaderContext rcontext; rcontext.clipping = NoClipping; rcontext.genome = genome; rcontext.paired = false; rcontext.defaultReadGroup = ""; while (rangeSplitter->getNextRange(&rangeStart, &rangeLength)) { if (NULL == samReader) { samReader = SAMReader::create(DataSupplier::Default[true], inputFileName, rcontext, rangeStart, rangeLength); } else { ((ReadReader *)samReader)->reinit(rangeStart, rangeLength); } AlignmentResult alignmentResult; unsigned genomeLocation; Direction isRC; unsigned mapQ; unsigned flag; const char *cigar; unsigned nextFileToWrite = 0; Read read; LandauVishkinWithCigar lv; while (samReader->getNextRead(&read, &alignmentResult, &genomeLocation, &isRC, &mapQ, &flag, &cigar)) { if (mapQ < 0 || mapQ > MaxMAPQ) { fprintf(stderr,"Invalid MAPQ: %d\n",mapQ); exit(1); } if (0xffffffff == genomeLocation) { context->nUnaligned++; } else { if (flag & SAM_REVERSE_COMPLEMENT) { read.becomeRC(); } const Genome::Piece *piece = genome->getPieceAtLocation(genomeLocation); if (NULL == piece) { fprintf(stderr,"couldn't find genome piece for offset %u\n",genomeLocation); exit(1); } unsigned offsetA, offsetB; bool matched; const unsigned cigarBufLen = 1000; char cigarForAligned[cigarBufLen]; const char *alignedGenomeData = genome->getSubstring(genomeLocation, 1); int editDistance = lv.computeEditDistance(alignedGenomeData, read.getDataLength() + 20, read.getData(), read.getDataLength(), 30, cigarForAligned, cigarBufLen, false); if (editDistance == -1 || editDistance > MaxEditDistance) { editDistance = MaxEditDistance; } // // Parse the read ID. The format is ChrName_OffsetA_OffsetB_?:<more stuff>. This would be simple to parse, except that // ChrName can include "_". So, we parse it by looking for the first : and then working backward. // char idBuffer[10000]; // Hopefully big enough. 
I'm not worried about malicious input data here. memcpy(idBuffer,read.getId(),read.getIdLength()); idBuffer[read.getIdLength()] = 0; const char *firstColon = strchr(idBuffer,':'); bool badParse = true; size_t chrNameLen; const char *beginningOfSecondNumber; const char *beginningOfFirstNumber; int stage = 0; unsigned offsetOfCorrectChromosome; if (NULL != firstColon && firstColon - 3 > idBuffer && (*(firstColon-1) == '?' || isADigit(*(firstColon - 1)))) { // // We've parsed backwards to see that we have at least #: or ?: where '#' is a digit and ? is literal. If it's // a digit, then scan backwards through that number. // const char *underscoreBeforeFirstColon = firstColon - 2; while (underscoreBeforeFirstColon > idBuffer && isADigit(*underscoreBeforeFirstColon)) { underscoreBeforeFirstColon--; } if (*underscoreBeforeFirstColon == '_' && (isADigit(*(underscoreBeforeFirstColon - 1)) || *(underscoreBeforeFirstColon - 1) == '_')) { stage = 1; if (isADigit(*(underscoreBeforeFirstColon - 1))) { beginningOfSecondNumber = firstColon - 3; while (beginningOfSecondNumber > idBuffer && isADigit(*beginningOfSecondNumber)) { beginningOfSecondNumber--; } beginningOfSecondNumber++; // That loop actually moved us back one char before the beginning; } else { // // There's only one number, we have two consecutive underscores. // beginningOfSecondNumber = underscoreBeforeFirstColon; } if (beginningOfSecondNumber - 2 > idBuffer && *(beginningOfSecondNumber - 1) == '_' && isADigit(*(beginningOfSecondNumber - 2))) { stage = 2; beginningOfFirstNumber = beginningOfSecondNumber - 2; while (beginningOfFirstNumber > idBuffer && isADigit(*beginningOfFirstNumber)) { beginningOfFirstNumber--; } beginningOfFirstNumber++; // Again, we went one too far. 
offsetA = -1; offsetB = -1; if (*(beginningOfFirstNumber - 1) == '_' && 1 == sscanf(beginningOfFirstNumber,"%u",&offsetA) && ('_' == *beginningOfSecondNumber || 1 == sscanf(beginningOfSecondNumber,"%u", &offsetB))) { stage = 3; chrNameLen = (beginningOfFirstNumber - 1) - idBuffer; char correctChromosomeName[1000]; memcpy(correctChromosomeName, idBuffer, chrNameLen); correctChromosomeName[chrNameLen] = '\0'; if (!genome->getOffsetOfPiece(correctChromosomeName, &offsetOfCorrectChromosome)) { fprintf(stderr, "Couldn't parse chromosome name '%s' from read id\n", correctChromosomeName); } else { badParse = false; } } } } if (badParse) { fprintf(stderr,"Unable to parse read ID '%s', perhaps this isn't simulated data. piecelen = %d, pieceName = '%s', piece offset = %u, genome offset = %u\n", idBuffer, strlen(piece->name), piece->name, piece->beginningOffset, genomeLocation); exit(1); } bool match0 = false; bool match1 = false; if (-1 == offsetA || -1 == offsetB) { matched = false; } else if(strncmp(piece->name, idBuffer, __min(read.getIdLength(), chrNameLen))) { matched = false; } else { if (isWithin(offsetA, genomeLocation - piece->beginningOffset, 50)) { matched = true; match0 = true; } else if (isWithin(offsetB, genomeLocation - piece->beginningOffset, 50)) { matched = true; match1 = true; } else { matched = false; if (flag & SAM_FIRST_SEGMENT) { match0 = true; } else { match1 = true; } } } context->countOfReads[mapQ]++; context->countOfReadsByEditDistance[mapQ][editDistance]++; if (!matched) { context->countOfMisalignments[mapQ]++; context->countOfMisalignmentsByEditDistance[mapQ][editDistance]++; if (70 == mapQ || 69 == mapQ) { // // We don't know which offset is correct, because neither one matched. Just take the one with the lower edit distance. 
// unsigned correctLocationA = offsetOfCorrectChromosome + offsetA; unsigned correctLocationB = offsetOfCorrectChromosome + offsetB; unsigned correctLocation = 0; const char *correctData = NULL; const char *dataA = genome->getSubstring(correctLocationA, 1); const char *dataB = genome->getSubstring(correctLocationB, 1); int distanceA, distanceB; char cigarA[cigarBufLen]; char cigarB[cigarBufLen]; cigarA[0] = '*'; cigarA[1] = '\0'; cigarB[0] = '*'; cigarB[1] = '\0'; if (dataA == NULL) { distanceA = -1; } else { distanceA = lv.computeEditDistance(dataA, read.getDataLength() + 20, read.getData(), read.getDataLength(), 30, cigarA, cigarBufLen, false); } if (dataB == NULL) { distanceB = -1; } else { distanceB = lv.computeEditDistance(dataB, read.getDataLength() + 20, read.getData(), read.getDataLength(), 30, cigarB, cigarBufLen, false); } const char *correctGenomeData; char *cigarForCorrect; if (distanceA != -1 && distanceA <= distanceB || distanceB == -1) { correctGenomeData = dataA; correctLocation = correctLocationA; cigarForCorrect = cigarA; } else { correctGenomeData = dataB; correctLocation = correctLocationB; cigarForCorrect = cigarB; } printf("%s\t%d\t%s\t%u\t%d\t%s\t*\t*\t100\t%.*s\t%.*s\tAlignedGenomeLocation:%u\tCorrectGenomeLocation: %u\tCigarForCorrect: %s\tCorrectData: %.*s\tAlignedData: %.*s\n", idBuffer, flag, piece->name, genomeLocation - piece->beginningOffset, mapQ, cigarForAligned, read.getDataLength(), read.getData(), read.getDataLength(), read.getQuality(), genomeLocation, correctLocation, cigarForCorrect, read.getDataLength(), correctGenomeData, read.getDataLength(), alignedGenomeData); } } } } // if it was mapped } // for each read from the sam reader } if (0 == InterlockedAdd64AndReturnNewValue(&nRunningThreads, -1)) { SignalSingleWaiterObject(&allThreadsDone); } }
int main(int argc, char* argv[]) { CommandLineParser clp; string readsFileName; string alignmentsFileName; string outputFileName; float minMergeIdentity = 0.70; clp.RegisterStringOption("reads", &readsFileName, "Reads used for alignments."); clp.RegisterStringOption("alignments", &alignmentsFileName, "SAM formatted alignments."); clp.RegisterIntOption("k", &vertexSize, "Minimum match length", CommandLineParser::PositiveInteger); clp.RegisterStringOption("outfile", &outputFileName, "Alignment output."); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterFlagOption("v", &verbose, ""); clp.RegisterFloatOption("minMergeIdentity", &minMergeIdentity, "Minimum identity to merge paths.", CommandLineParser::PositiveFloat); clp.ParseCommandLine(argc, argv); if (minMergeIdentity < 0 or minMergeIdentity > 1) { cout << "ERROR. minMergeIdentity must be between 0 and 1" << endl; exit(1); } vector<FASTASequence> reads; FASTAReader fastaReader; fastaReader.Initialize(readsFileName); fastaReader.ReadAllSequences(reads); // // It is necessary to go from read title to index in the list of reads. // map<string, int> readNameToIndex; BuildReadNameToIndexMap(reads, readNameToIndex); ReadWordMatchVector readWordMatches; InitializeFromReads(reads, readWordMatches); // // Get ready to read in the alignments. 
// SAMReader<SAMFullReferenceSequence, SAMReadGroup, SAMPosAlignment> samReader; samReader.Initialize(alignmentsFileName); AlignmentSet<SAMFullReferenceSequence, SAMReadGroup, SAMPosAlignment> alignmentSet; samReader.ReadHeader(alignmentSet); SAMAlignment samAlignment; AlignmentCandidate<> alignment; int numAlignedBases = 0; int alignmentIndex = 0; while ( samReader.GetNextAlignment( samAlignment ) ) { vector<AlignmentCandidate<> > alignments; SAMAlignmentsToCandidates(samAlignment, reads, readNameToIndex, alignments, false, true); int i; ++alignmentIndex; int a; for (a = 0; a < alignments.size();a++) { if (alignments[a].qName != alignments[a].tName) { MarkMatches(alignments[a], readNameToIndex, vertexSize, readWordMatches); } } if (alignmentIndex % 1000 == 0) { cout << alignmentIndex << endl; } } int numMatches = 0; int parentIndex = 1; int r; for (r = 0; r < readWordMatches.size(); r++) { readWordMatches[r].CreateParents(); numMatches += readWordMatches[r].pos.size(); } vector<int> parentIndices; parentIndices.resize(2*numMatches + 1); fill(parentIndices.begin(), parentIndices.end(), 0); // // Start indexing off at 1 so that 0 does not need to be treated in // a special case. // int curParentIndex = 1; cout << "There are " << numMatches << " matches." 
<< endl; samReader.Close(); samReader.Initialize(alignmentsFileName); AlignmentSet<SAMFullReferenceSequence, SAMReadGroup, SAMPosAlignment> alignmentSet2; samReader.ReadHeader(alignmentSet2); numAlignedBases = 0; alignmentIndex = 0; while ( samReader.GetNextAlignment( samAlignment ) ) { vector<AlignmentCandidate<> > alignments; SAMAlignmentsToCandidates(samAlignment, reads, readNameToIndex, alignments, false, true); int i; ++alignmentIndex; int a; for (a = 0; a < alignments.size();a++) { if (alignments[a].qName != alignments[a].tName) { JoinVertices(alignments[a], vertexSize, readNameToIndex, readWordMatches, curParentIndex, parentIndices); } } if (alignmentIndex % 1000 == 0) { cout << alignmentIndex << endl; } } vector<int> parentCounts; parentCounts.resize(parentIndices.size()); fill(parentCounts.begin(), parentCounts.end(), 0); int p; PromoteAll(parentIndices); int i; for (r = 0; r < readWordMatches.size(); r++) { for (i = 0; i < readWordMatches[r].parents.size(); i++) { readWordMatches[r].parents[i] = parentIndices[readWordMatches[r].parents[i]]; parentCounts[readWordMatches[r].parents[i]]++; } } /* for (i = 0; i < readWordMatches.size(); i++) { readWordMatches[i].PrintPos(cout); readWordMatches[i].PrintParents(cout); } */ map<int,int> hist; int numParents = 0; for (i = 1; i < parentCounts.size() && parentIndices[i] != 0; i++) { if (parentCounts[i] != 0) { ++numParents; } if (hist.find(parentCounts[i]) == hist.end()) { hist[parentCounts[i]] = 1; } else { hist[parentCounts[i]]++; } } map<int,int>::iterator histIt; cout << " freq count" << endl; for(histIt = hist.begin(); histIt != hist.end(); ++histIt) { cout << (*histIt).second << " " << (*histIt).first << endl; } MatchVertexList vertices; vertices.resize(numParents); cout << "there are " << numParents << " parents. " << endl; }
int main(int argc, char* argv[]) { string program = "samtoh5"; string versionString = VERSION; AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString); string samFileName, cmpFileName, refFileName; bool parseSmrtTitle = false; bool useShortRefName = false; CommandLineParser clp; string readType = "standard"; int verbosity = 0; clp.SetProgramName(program); clp.SetProgramSummary("Converts in.sam file to out.cmp.h5 file."); clp.SetVersion(versionString); clp.RegisterStringOption("in.sam", &samFileName, "Input SAM file.", true); clp.RegisterStringOption("reference.fasta", &refFileName, "Reference used to generate reads.", true); clp.RegisterStringOption("out.cmp.h5", &cmpFileName, "Output cmp.h5 file.", true); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterFlagOption("smrtTitle", &parseSmrtTitle, "Use this option when converting alignments " "generated from reads produced by the " "pls2fasta from bas.h5 files by parsing read " "coordinates from the SMRT read title. The title " "is in the format /name/hole/coordinates, where " "coordinates are in the format \\d+_\\d+, and " "represent the interval of the read that was " "aligned."); clp.RegisterStringOption("readType", &readType, "Set the read type: 'standard', 'strobe', 'CCS', " "or 'cDNA'"); clp.RegisterIntOption("verbosity", &verbosity, "Set desired verbosity.", CommandLineParser::PositiveInteger); clp.RegisterFlagOption("useShortRefName", &useShortRefName, "Use abbreviated reference names obtained " "from file.sam instead of using full names " "from reference.fasta."); string description = ("Because SAM has optional tags that have different " "meanings in different programs, careful usage is required in order to " "have proper output. 
The \"xs\" tag in bwa-sw is used to show the " "suboptimal score, but in PacBio SAM (blasr) it is defined as the start " "in the query sequence of the alignment.\nWhen \"-smrtTitle\" is " "specified, the xs tag is ignored, but when it is not specified, the " "coordinates given by the xs and xe tags are used to define the interval " "of a read that is aligned. The CIGAR string is relative to this interval."); clp.SetExamples(description); clp.ParseCommandLine(argc, argv); if (readType != "standard" and readType != "strobe" and readType != "cDNA" and readType != "CCS") { cout << "ERROR. Read type '" << readType << "' must be one of either 'standard', 'strobe', 'cDNA' or 'CCS'." << endl; exit(1); } cerr << "[INFO] " << GetTimestamp() << " [" << program << "] started." << endl; SAMReader<SAMFullReferenceSequence, SAMReadGroup, SAMPosAlignment> samReader; FASTAReader fastaReader; HDFCmpFile<AlignmentCandidate<FASTASequence, FASTASequence> > cmpFile; // // Initialize input/output files. // samReader.Initialize(samFileName); fastaReader.Initialize(refFileName); cmpFile.Create(cmpFileName); // // Configure the file log. // string command; CommandLineParser::CommandLineToString(argc, argv, command); string log = "Convert sam to cmp.h5"; cmpFile.fileLogGroup.AddEntry(command, log, program, GetTimestamp(), versionString); // // Set the readType // cmpFile.SetReadType(readType); // // Read necessary input. // vector<FASTASequence> references; fastaReader.ReadAllSequences(references); // // This should probably be handled by the alignmentSetAdapter, but // time constraints... // AlignmentSet<SAMFullReferenceSequence, SAMReadGroup, SAMPosAlignment> alignmentSet; samReader.ReadHeader(alignmentSet); // // The order of references in vector<FASTASequence> references and // AlignmentSet<, , >alignmentSet.references can be different. // Rearrange alignmentSet.references such that it is ordered in // exactly the same way as vector<FASTASequence> references. 
// alignmentSet.RearrangeReferences(references); // // Always recompute the MD5 values even if they exist in the input // sam file. Because MD5 is defined differently in sam and cmp.h5 files. // The SAM convention uppercases and normalizes before computing the MD5. // For cmp.h5, we compute the MD5 on the sequence 'as is'. // for(int i = 0; i < alignmentSet.references.size(); i++) { MakeMD5((const char*)&references[i].seq[0], (unsigned int)references[i].length, alignmentSet.references[i].md5); } // // Map short names for references obtained from file.sam to full names obtained from reference.fasta // map<string, string> shortRefNameToFull; map<string, string>::iterator it; assert(references.size() == alignmentSet.references.size()); if (!useShortRefName) { for (int i = 0; i < references.size(); i++) { string shortRefName = alignmentSet.references[i].GetSequenceName(); string fullRefName(references[i].title); if (shortRefNameToFull.find(shortRefName) != shortRefNameToFull.end()) { cout << "ERROR, Found more than one reference " << shortRefName << "in sam header" << endl; exit(1); } shortRefNameToFull[shortRefName] = fullRefName; alignmentSet.references[i].sequenceName = fullRefName; } } // // Start setting up the cmp.h5 file. // AlignmentSetToCmpH5Adapter<HDFCmpFile<AlignmentCandidate<FASTASequence, FASTASequence> > > alignmentSetAdapter; alignmentSetAdapter.Initialize(); alignmentSetAdapter.StoreReferenceInfo(alignmentSet.references, cmpFile); // // Store the alignments. // SAMAlignment samAlignment; int alignIndex = 0; while (samReader.GetNextAlignment(samAlignment)) { if (samAlignment.rName == "*") { continue; } if (!useShortRefName) { //convert shortRefName to fullRefName it = shortRefNameToFull.find(samAlignment.rName); if (it == shortRefNameToFull.end()) { cout << "ERROR, Could not find " << samAlignment.rName << " in the reference repository." 
<< endl; exit(1); } samAlignment.rName = (*it).second; } vector<AlignmentCandidate<> > convertedAlignments; if (verbosity > 0) { cout << "Storing alignment for " << samAlignment.qName << endl; } SAMAlignmentsToCandidates(samAlignment, references, alignmentSetAdapter.refNameToIndex, convertedAlignments, parseSmrtTitle, false); alignmentSetAdapter.StoreAlignmentCandidateList(convertedAlignments, cmpFile, alignIndex); int a; for (a = 0; a < convertedAlignments.size(); a++) { convertedAlignments[a].FreeSubsequences(); } ++alignIndex; /* if (alignIndex == 100) { return 0; }*/ } cerr << "[INFO] " << GetTimestamp() << " [" << program << "] ended." << endl; return 0; }
int main(int argc, char* argv[]) { string program = "samtom4"; string versionString = VERSION; AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString); string samFileName, refFileName, outFileName; bool printHeader = false; bool parseSmrtTitle = false; bool useShortRefName = false; CommandLineParser clp; clp.SetProgramName(program); clp.SetVersion(versionString); clp.SetProgramSummary("Converts a SAM file generated by blasr to M4 format."); clp.RegisterStringOption("in.sam", &samFileName, "Input SAM file, which is produced by blasr."); clp.RegisterStringOption("reference.fasta", &refFileName, "Reference used to generate file.sam."); clp.RegisterStringOption("out.m4", &outFileName, "Output in blasr M4 format."); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterFlagOption("header", &printHeader, "Print M4 header."); clp.RegisterFlagOption("useShortRefName", &useShortRefName, "Use abbreviated reference names obtained " "from file.sam instead of using full names " "from reference.fasta."); //clp.SetExamples(program + " file.sam reference.fasta out.m4"); clp.ParseCommandLine(argc, argv); ostream * outFilePtr = &cout; ofstream outFileStrm; if (outFileName != "") { CrucialOpen(outFileName, outFileStrm, std::ios::out); outFilePtr = &outFileStrm; } SAMReader<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> samReader; FASTAReader fastaReader; // // Initialize samReader and fastaReader. // samReader.Initialize(samFileName); fastaReader.Initialize(refFileName); // // Configure the file log. // string command; CommandLineParser::CommandLineToString(argc, argv, command); // // Read necessary input. // vector<FASTASequence> references; fastaReader.ReadAllSequences(references); AlignmentSet<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> alignmentSet; samReader.ReadHeader(alignmentSet); // // The order of references in vector<FASTASequence> references and // AlignmentSet<, , >alignmentSet.references can be different. 
// Rearrange alignmentSet.references such that it is ordered in // exactly the same way as vector<FASTASequence> references. // alignmentSet.RearrangeReferences(references); // // Map short names for references obtained from file.sam to // full names obtained from reference.fasta // map<string, string> shortRefNameToFull; map<string, string>::iterator it; assert(references.size() == alignmentSet.references.size()); if (!useShortRefName) { for (size_t i = 0; i < references.size(); i++) { string shortRefName = alignmentSet.references[i].GetSequenceName(); string fullRefName(references[i].title); if (shortRefNameToFull.find(shortRefName) != shortRefNameToFull.end()) { cout << "ERROR, Found more than one reference " << shortRefName << "in sam header" << endl; exit(1); } shortRefNameToFull[shortRefName] = fullRefName; alignmentSet.references[i].sequenceName = fullRefName; } } // Map reference name obtained from SAM file to indices map<string, int> refNameToIndex; for (size_t i = 0; i < references.size(); i++) { string refName = alignmentSet.references[i].GetSequenceName(); refNameToIndex[refName] = i; } // // Store the alignments. // SAMAlignment samAlignment; size_t alignIndex = 0; // // For 150K, each chip produces about 300M sequences // (not including quality values and etc.). // Let's assume that the sam file and reference data can // fit in the memory. // Need to scale for larger sequal data in the future. // if (printHeader) IntervalOutput::PrintHeader(*outFilePtr); // The socre matrix does not matter because we will use the // aligner's score from SAM file anyway. DistanceMatrixScoreFunction<DNASequence, DNASequence> distScoreFn; while (samReader.GetNextAlignment(samAlignment)) { if (samAlignment.rName == "*") { continue; } if (!useShortRefName) { //convert shortRefName to fullRefName it = shortRefNameToFull.find(samAlignment.rName); if (it == shortRefNameToFull.end()) { cout << "ERROR, Could not find " << samAlignment.rName << " in the reference repository." 
<< endl; exit(1); } samAlignment.rName = (*it).second; } // The padding character 'P' is not supported if (samAlignment.cigar.find('P') != string::npos) { cout << "WARNING. Could not process sam record with 'P' in its cigar string." << endl; continue; } vector<AlignmentCandidate<> > convertedAlignments; // // Keep reference as forward. // So if IsReverseComplement(sam.flag)==true, then qStrand is reverse // and tStrand is forward. // bool keepRefAsForward = false; SAMAlignmentsToCandidates(samAlignment, references, refNameToIndex, convertedAlignments, parseSmrtTitle, keepRefAsForward); if (convertedAlignments.size() > 1) { cout << "WARNING. Ignore an alignment which has multiple segments." << endl; continue; } //all alignments are unique single-ended alignments. for (int i = 0; i < 1; i++) { AlignmentCandidate<> & alignment = convertedAlignments[i]; ComputeAlignmentStats(alignment, alignment.qAlignedSeq.seq, alignment.tAlignedSeq.seq, distScoreFn); // Use aligner's score from SAM file anyway. alignment.score = samAlignment.as; alignment.mapQV = samAlignment.mapQV; // Since SAM only has the aligned sequence, many info of the // original query (e.g. the full length) is missing. // Overwrite alignment.qLength (which is length of the query // in the SAM alignment) with xq (which is the length of the // original query sequence saved by blasr) right before printing // the output so that one can reconstruct a blasr m4 record from // a blasr sam alignment. if (samAlignment.xq!=0) alignment.qLength = samAlignment.xq; IntervalOutput::PrintFromSAM(alignment, *outFilePtr); alignment.FreeSubsequences(); } ++alignIndex; } if (outFileName != "") { outFileStrm.close(); } return 0; }
int main(int argc, char* argv[]) { #ifdef USE_GOOGLE_PROFILER char *profileFileName = getenv("CPUPROFILE"); if (profileFileName != NULL) { ProfilerStart(profileFileName); } else { ProfilerStart("google_profile.txt"); } #endif // Register inputs and outputs. string samFileName, refFileName, outFileName; CommandLineParser clp; clp.RegisterStringOption("file.sam", &samFileName, "Input SAM file."); clp.RegisterStringOption("reference.fasta", &refFileName, "Reference used to generate reads."); clp.RegisterStringOption("out.sam", &outFileName, "Output SAM file."); clp.RegisterPreviousFlagsAsHidden(); // Register filter criteria options. int minAlnLength = 50; float minPctSimilarity = 70, minPctAccuracy = 70; string hitPolicyStr = "randombest"; bool useScoreCutoff = false; int scoreCutoff = INF_INT; int scoreSignInt = -1; RegisterFilterOptions(clp, minAlnLength, minPctSimilarity, minPctAccuracy, hitPolicyStr, useScoreCutoff, scoreSignInt, scoreCutoff); int seed = 1; clp.RegisterIntOption("seed", &seed, "(1) Seed for random number generator.\n" "If seed is 0, then use current time as seed.", CommandLineParser::Integer); string holeNumberStr; Ranges holeNumberRanges; clp.RegisterStringOption("holeNumbers", &holeNumberStr, "A string of comma-delimited hole number ranges to output hits, " "such as '1,2,10-12'. " "This requires hit titles to be in SMRT read title format."); bool parseSmrtTitle = false; clp.RegisterFlagOption("smrtTitle", &parseSmrtTitle, "Use this option when filtering alignments generated by " "programs other than blasr, e.g. bwa-sw or gmap. " " Parse read coordinates from the SMRT read title. " "The title is in the format /name/hole/coordinates, where" " coordinates are in the format \\d+_\\d+, and represent " "the interval of the read that was aligned."); /* This experimental option can be useful for metagenomics, in which case * there are hundreds of sequences in the target, of which many titles are * long and may contain white spaces (e.g., ' ', '\t'). 
* In order to save disc space and avoid the (possibly) none unique mapping * between full and short reference names, one may call blasr with * -titleTable option to represent all target sequences in the output * by their indices in the title table.*/ string titleTableName = ""; clp.RegisterStringOption("titleTable", &titleTableName, "Use this experimental option when filtering alignments generated by " "blasr with -titleTable titleTableName, in which case " "reference titles in SAM are represented by their " "indices (e.g., 0, 1, 2, ...) in the title table."); string adapterGffFileName = ""; clp.RegisterStringOption("filterAdapterOnly", &adapterGffFileName, "Use this option to remove reads which can only map to adapters " "specified in the GFF file."); bool verbose = false; clp.RegisterFlagOption("v", &verbose, "Be verbose."); clp.SetExamples( "Because SAM has optional tags that have different meanings" " in different programs, careful usage is required in order " "to have proper output. The \"xs\" tag in bwa-sw is used to " "show the suboptimal score, but in PacBio SAM (blasr) it is " "defined as the start in the query sequence of the alignment.\n" "When \"-smrtTitle\" is specified, the xs tag is ignored, but " "when it is not specified, the coordinates given by the xs and " "xe tags are used to define the interval of a read that is " "aligned. The CIGAR string is relative to this interval."); clp.ParseCommandLine(argc, argv); // Set random number seed. if (seed == 0) { srand(time(NULL)); } else { srand(seed); } scoreSign = (scoreSignInt == -1)?ScoreSign::NEGATIVE:ScoreSign::POSITIVE; Score s(static_cast<float>(scoreCutoff), scoreSign); FilterCriteria filterCriteria(minAlnLength, minPctSimilarity, minPctAccuracy, true, s); filterCriteria.Verbose(verbose); HitPolicy hitPolicy(hitPolicyStr, scoreSign); string errMsg; if (not filterCriteria.MakeSane(errMsg)) { cout << errMsg << endl; exit(1); } // Parse hole number ranges. 
if (holeNumberStr.size() != 0) { if (not holeNumberRanges.setRanges(holeNumberStr)) { cout << "Could not parse hole number ranges: " << holeNumberStr << "." << endl; exit(1); } } // Open output file. ostream * outFilePtr = &cout; ofstream outFileStrm; if (outFileName != "") { CrucialOpen(outFileName, outFileStrm, std::ios::out); outFilePtr = &outFileStrm; } GFFFile adapterGffFile; if (adapterGffFileName != "") adapterGffFile.ReadAll(adapterGffFileName); SAMReader<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> samReader; FASTAReader fastaReader; // // Initialize samReader and fastaReader. // samReader.Initialize(samFileName); fastaReader.Initialize(refFileName); // // Configure the file log. // string command; CommandLineParser::CommandLineToString(argc, argv, command); string log = "Filter sam hits."; string program = "samFilter"; string versionString = VERSION; AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString); // // Read necessary input. // vector<FASTASequence> references; fastaReader.ReadAllSequences(references); // If the SAM file is generated by blasr with -titleTable, // then references in the SAM are represented by // their corresponding indices in the title table. // In that case, we need to convert reference titles in fasta file // to their corresponding indices in the title table, such that // references in both SAM and fasta files are represented // by title table indices and therefore can match. if (titleTableName != "") { ConvertTitlesToTitleTableIndices(references, titleTableName); } AlignmentSet<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> alignmentSet; vector<string> allHeaders = samReader.ReadHeader(alignmentSet); // Process SAM Header. 
string commandLineString; clp.CommandLineToString(argc, argv, commandLineString); allHeaders.push_back("@PG\tID:SAMFILTER\tVN:" + versionString + \ "\tCL:" + program + " " + commandLineString); for (int i = 0; i < allHeaders.size(); i++) { outFileStrm << allHeaders[i] << endl; } // // The order of references in vector<FASTASequence> references and // AlignmentSet<, , >alignmentSet.references can be different. // Rearrange alignmentSet.references such that they are ordered in // exactly the same way as vector<FASTASequence> references. // alignmentSet.RearrangeReferences(references); // Map reference name obtained from SAM file to indices map<string, int> refNameToIndex; for (int i = 0; i < references.size(); i++) { string refName = alignmentSet.references[i].GetSequenceName(); refNameToIndex[refName] = i; } // // Store the alignments. // SAMAlignment samAlignment; int alignIndex = 0; // // For 150K, each chip produces about 300M sequences // (not including quality values and etc.). // Let's assume that the sam file and reference data can // fit in the memory. // Need to scale for larger sequal data in the future. // vector<SAMAlignment> allSAMAlignments; while (samReader.GetNextAlignment(samAlignment)) { if (samAlignment.rName == "*") { continue; } if (parseSmrtTitle and holeNumberStr.size() != 0) { string movieName; int thisHoleNumber; if (not ParsePBIReadName(samAlignment.qName, movieName, thisHoleNumber)) { cout << "ERROR, could not parse SMRT title: " << samAlignment.qName << "." << endl; exit(1); } if (not holeNumberRanges.contains(UInt(thisHoleNumber))) { if (verbose) cout << thisHoleNumber << " is not in range." << endl; continue; } } if (samAlignment.cigar.find('P') != string::npos) { cout << "WARNING. Could not process SAM record with 'P' in " << "its cigar string." 
<< endl; continue; } vector<AlignmentCandidate<> > convertedAlignments; SAMAlignmentsToCandidates(samAlignment, references, refNameToIndex, convertedAlignments, parseSmrtTitle, false); if (convertedAlignments.size() > 1) { cout << "WARNING. Ignore multiple segments." << endl; continue; } for (int i = 0; i < 1; i++) { AlignmentCandidate<> & alignment = convertedAlignments[i]; //score func does not matter DistanceMatrixScoreFunction<DNASequence, DNASequence> distFunc; ComputeAlignmentStats(alignment, alignment.qAlignedSeq.seq, alignment.tAlignedSeq.seq, distFunc); // Check whether this alignment can only map to adapters in // the adapter GFF file. if (adapterGffFileName != "" and CheckAdapterOnly(adapterGffFile, alignment, refNameToIndex)) { if (verbose) cout << alignment.qName << " filter adapter only." << endl; continue; } // Assign score to samAlignment. samAlignment.score = samAlignment.as; if (not filterCriteria.Satisfy(static_cast<AlignmentCandidate<> *>(&alignment))) { continue; } allSAMAlignments.push_back( samAlignment ); alignment.FreeSubsequences(); } ++alignIndex; } // Sort all SAM alignments by qName, score and target position. sort(allSAMAlignments.begin(), allSAMAlignments.end(), byQNameScoreTStart); unsigned int groupBegin = 0; unsigned int groupEnd = -1; vector<SAMAlignment> filteredSAMAlignments; while(groupBegin < allSAMAlignments.size()) { // Get the next group of SAM alignments which have the same qName // from allSAMAlignments[groupBegin ... 
groupEnd) GetNextSAMAlignmentGroup(allSAMAlignments, groupBegin, groupEnd); vector<unsigned int> hitIndices = ApplyHitPolicy( hitPolicy, allSAMAlignments, groupBegin, groupEnd); for(unsigned int i = 0; i < hitIndices.size(); i++) { filteredSAMAlignments.push_back(allSAMAlignments[hitIndices[i]]); } groupBegin = groupEnd; } // Sort all SAM alignments by reference name and query name sort(filteredSAMAlignments.begin(), filteredSAMAlignments.end(), byRNameQName); for(unsigned int i = 0; i < filteredSAMAlignments.size(); i++) { filteredSAMAlignments[i].PrintSAMAlignment(outFileStrm); } if (outFileName != "") { outFileStrm.close(); } #ifdef USE_GOOGLE_PROFILER ProfilerStop(); #endif return 0; }