int main(int argc, char* argv[]) { string plsFileName; int advance; if (argc <= 2) { cout << "usage: testAdvance file.pls.h5 advance " << endl; cout << "move 'advance' reads forward in a file." << endl; exit(1); } plsFileName = argv[1]; advance = atoi(argv[2]); ReaderAgglomerate reader; reader.Initialize(plsFileName); SMRTSequence seq; int seqIndex = 0; int i; for (i = 0; i < 4; i++ ){ seq.Free(); reader.Advance(advance); reader.GetNext(seq); } seq.PrintSeq(cout); }
bool GetNextReadThroughSemaphore(ReaderAgglomerate &reader, MappingParameters ¶ms, T_Sequence &read, string & readGroupId, int & associatedRandInt, MappingSemaphores & semaphores) { // Wait on a semaphore if (params.nProc > 1) { #ifdef __APPLE__ sem_wait(semaphores.reader); #else sem_wait(&semaphores.reader); #endif } bool returnValue = true; // // CCS Reads are read differently from other reads. Do static casting here // of this. // if (reader.GetNext(read, associatedRandInt) == 0) { returnValue = false; } // // Set the read group id before releasing the semaphore, since other // threads may change the reader object to a new read group before // sending this alignment out to printing. readGroupId = reader.readGroupId; if (params.nProc > 1) { #ifdef __APPLE__ sem_post(semaphores.reader); #else sem_post(&semaphores.reader); #endif } return returnValue; }
int main(int argc, char* argv[]) { string plsFileName, fastaOutName; if (argc < 2) { cout << "usage: pls2fasta file.pls.h5 file.fasta " << endl; cout << "Print reads stored in hdf as fasta." << endl; exit(1); } vector<string> plsFileNames; plsFileName = argv[1]; fastaOutName = argv[2]; if (FileOfFileNames::IsFOFN(plsFileName)) { FileOfFileNames::FOFNToList(plsFileName, plsFileNames); } else { plsFileNames.push_back(plsFileName); } int plsFileIndex; for (plsFileIndex = 0; plsFileIndex < plsFileNames.size(); plsFileIndex++) { ReaderAgglomerate reader; reader.IgnoreCCS(); reader.Initialize(plsFileNames[plsFileIndex]); ofstream fastaOut; CrucialOpen(fastaOutName, fastaOut); SMRTSequence seq; int seqIndex = 0; while (reader.GetNext(seq)) { seq.PrintQualSeq(fastaOut); } } }
int main(int argc, char* argv[]) { string queryFileName, targetFileName; if (argc < 3) { cout << "Usage: guidedalign query target [sdptuple]" << endl; exit(1); } queryFileName = argv[1]; targetFileName = argv[2]; int sdpTupleSize = 4; if (argc > 3) { sdpTupleSize = atoi(argv[3]); } ReaderAgglomerate reader; FASTQSequence query, target; reader.Initialize(queryFileName); reader.GetNext(query); reader.Close(); reader.Initialize(targetFileName); reader.GetNext(target); reader.Close(); int alignScore; /* Alignment sdpAlignment; int nSDPHits = 0; alignScore = SDPAlign(query, target, SMRTDistanceMatrix, 4, 4, sdpTupleSize, 4, 0.90, sdpAlignment, nSDPHits, Local, false, false); int b; for (b = 0; b < sdpAlignment.blocks.size(); b++) { sdpAlignment.blocks[b].qPos += sdpAlignment.qPos; sdpAlignment.blocks[b].tPos += sdpAlignment.tPos; } Guide guide; int bandSize = 16; AlignmentToGuide(sdpAlignment, guide, bandSize); StoreMatrixOffsets(guide); int guideSize = ComputeMatrixNElem(guide); int i; */ vector<int> scoreMat; vector<Arrow> pathMat; vector<double> probMat, optPathProbMat; vector<float> lnSubVect, lnInsVect, lnDelVect, lnMatchVect; // AlignmentCandidate<FASTASequence, FASTASequence> alignment; Alignment alignment; DistanceMatrixScoreFunction<DNASequence, DNASequence> distScoreFn; distScoreFn.del = 3; distScoreFn.ins = 3; distScoreFn.InitializeScoreMatrix(SMRTDistanceMatrix); alignScore = GuidedAlign(query, target, distScoreFn, 10, // in order after edit distance: // pairwise-ins, pairwise-del, k, sdp-ins, sdp-del, sdp-insrate // distScoreFn, 5,5,.15, alignment, Local, false, 8); // StickPrintAlignment(alignment, query, target, cout); }
int main(int argc, char* argv[]) { string fileAName, fileBName; if (argc < 3) { cout << "usage: extendAlign file1 fil2 [pos1 pos2] " << endl; exit(0); } fileAName = argv[1]; fileBName = argv[2]; int argi = 3; int aPos = 0; int bPos = 0; if (argc == 5) { aPos = atoi(argv[3]); bPos = atoi(argv[4]); } ReaderAgglomerate reader; reader.Initialize(fileAName); FASTASequence aSeq, bSeq; reader.GetNext(aSeq); reader.Initialize(fileBName); reader.GetNext(bSeq); DistanceMatrixScoreFunction<FASTASequence, FASTASequence> scoreFn; scoreFn.ins = 3; scoreFn.del = 3; scoreFn.InitializeScoreMatrix(SMRTDistanceMatrix); vector<int> scoreMat; vector<Arrow>pathMat; AlignmentCandidate<FASTASequence, FASTASequence> extendedAlignment; /* ExtendAlignmentForward(aSeq, aPos, bSeq, bPos, 5, //k scoreMat, pathMat, extendedAlignment, scoreFn, 1, // don't bother attempting // to extend the alignment // if one of the sequences // is less than 1 base long 2); extendedAlignment.qAlignedSeq.ReferenceSubstring(aSeq); extendedAlignment.tAlignedSeq.ReferenceSubstring(bSeq); // extendedAlignment.qAlignedSeqPos = aPos; // extendedAlignment.tAlignedSeqPos = bPos; StickPrintAlignment(extendedAlignment, aSeq, bSeq, cout); extendedAlignment.Clear(); */ if (aPos == 0) { aPos = aSeq.length; } if (bPos == 0) { bPos = bSeq.length; } ExtendAlignmentReverse(aSeq, aPos, bSeq, bPos, 5, //k scoreMat, pathMat, extendedAlignment, scoreFn, 1, // don't bother attempting // to extend the alignment // if one of the sequences // is less than 1 base long 2); extendedAlignment.qAlignedSeq.ReferenceSubstring(aSeq); extendedAlignment.tAlignedSeq.ReferenceSubstring(bSeq); // extendedAlignment.qAlignedSeqPos = aPos; // extendedAlignment.tAlignedSeqPos = bPos; StickPrintAlignment(extendedAlignment, aSeq, bSeq, cout); return 0; }
int main(int argc, char* argv[]) { string program = "pls2fasta"; string versionString = VERSION; AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString); string plsFileName, fastaOutName; vector<string> plsFileNames; bool trimByRegion, maskByRegion; trimByRegion = false; maskByRegion = false; int argi = 3; RegionTable regionTable; string regionsFOFNName = ""; vector<string> regionFileNames; bool splitSubreads = true; int minSubreadLength = 0; bool addSimulatedData = false; bool printSimulatedCoordinate = false; bool printSimulatedSequenceIndex = false; bool printFastq = false; bool printCcs = false; int lineLength = 50; int minReadScore = 0; vector<int> holeNumbers; CommandLineParser clp; bool printOnlyBest = false; clp.SetProgramName(program); clp.SetVersion(versionString); clp.RegisterStringOption("in.pls.h5", &plsFileName, "Input pls.h5/bax.h5/fofn file.", true); clp.RegisterStringOption("out.fasta", &fastaOutName, "Output fasta/fastq file.", true); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterFlagOption("trimByRegion", &trimByRegion, "Trim away low quality regions."); clp.RegisterFlagOption("maskByRegion", &maskByRegion, "Mask low quality regions with 'N'."); clp.RegisterStringOption("regionTable", ®ionsFOFNName, "Optional HDF file with a /PulseData/Regions dataset."); clp.RegisterIntOption("minSubreadLength", &minSubreadLength, "Do not write subreads less than the specified length.", CommandLineParser::PositiveInteger); clp.RegisterFlagOption("noSplitSubreads", &splitSubreads, "Do not split reads on adapter sequences."); clp.RegisterIntListOption("holeNumber", &holeNumbers, "Only print this hole number (or list of numbers)."); clp.RegisterFlagOption("fastq", &printFastq, "Print in FASTQ format with quality."); clp.RegisterFlagOption("ccs", &printCcs, "Print de novo CCS sequences"); clp.RegisterIntOption("lineLength", &lineLength, "Specify fasta/fastq line length", CommandLineParser::PositiveInteger); clp.RegisterIntOption("minReadScore", &minReadScore, "Minimum read score to print a read. The score is " "a number between 0 and 1000 and represents the expected accuracy percentage * 10. " "A typical value would be between 750 and 800. This does not apply to ccs reads.", CommandLineParser::NonNegativeInteger); clp.RegisterFlagOption("best", &printOnlyBest, "If a CCS sequence exists, print this. Otherwise, print the longest" "subread. This does not support fastq."); string description = ("Converts pls.h5/bax.h5/fofn files to fasta or fastq files. Although fasta files are provided" " with every run, they are not trimmed nor split into subreads. This program takes " "additional annotation information, such as the subread coordinates and high quality regions " "and uses them to create fasta sequences that are substrings of all bases called. Most of the time " "you will want to trim low quality reads, so you should specify -trimByRegion."); clp.SetProgramSummary(description); clp.ParseCommandLine(argc, argv); cerr << "[INFO] " << GetTimestamp() << " [" << program << "] started." << endl; if (trimByRegion and maskByRegion) { cout << "ERROR! You cannot both trim and mask regions. Use one or the other." << endl; exit(1); } if (printFastq) { // Setting lineLength to 0 flags to print on one line. lineLength = 0; } if (FileOfFileNames::IsFOFN(plsFileName)) { FileOfFileNames::FOFNToList(plsFileName, plsFileNames); } else { plsFileNames.push_back(plsFileName); } if (regionsFOFNName == "") { regionFileNames = plsFileNames; } else { if (FileOfFileNames::IsFOFN(regionsFOFNName)) { FileOfFileNames::FOFNToList(regionsFOFNName, regionFileNames); } else { regionFileNames.push_back(regionsFOFNName); } } ofstream fastaOut; CrucialOpen(fastaOutName, fastaOut); int plsFileIndex; HDFRegionTableReader hdfRegionReader; sort(holeNumbers.begin(), holeNumbers.end()); for (plsFileIndex = 0; plsFileIndex < plsFileNames.size(); plsFileIndex++) { if (trimByRegion or maskByRegion or splitSubreads) { hdfRegionReader.Initialize(regionFileNames[plsFileIndex]); hdfRegionReader.ReadTable(regionTable); regionTable.SortTableByHoleNumber(); } ReaderAgglomerate reader; HDFBasReader ccsReader; if (printOnlyBest) { ccsReader.SetReadBasesFromCCS(); ccsReader.Initialize(plsFileNames[plsFileIndex]); } if (printCcs == false) { reader.IgnoreCCS(); } else { reader.hdfBasReader.SetReadBasesFromCCS(); } if (addSimulatedData) { reader.hdfBasReader.IncludeField("SimulatedCoordinate"); reader.hdfBasReader.IncludeField("SimulatedSequenceIndex"); } if (reader.SetReadFileName(plsFileNames[plsFileIndex]) == 0) { cout << "ERROR, could not determine file type." << plsFileNames[plsFileIndex] << endl; exit(1); } if (reader.Initialize() == 0) { cout << "ERROR, could not initialize file " << plsFileNames[plsFileIndex] << endl; exit(1); } DNALength simulatedCoordinate; DNALength simulatedSequenceIndex; reader.SkipReadQuality(); SMRTSequence seq; vector<ReadInterval> subreadIntervals;; SMRTSequence ccsSeq; while (reader.GetNext(seq)) { if (printOnlyBest) { ccsReader.GetNext(ccsSeq); } if (holeNumbers.size() != 0 and binary_search(holeNumbers.begin(), holeNumbers.end(), seq.zmwData.holeNumber) == false) { continue; } if (seq.length == 0) { continue; } if (addSimulatedData) { reader.hdfBasReader.simulatedCoordinateArray.Read(reader.hdfBasReader.curRead-1, reader.hdfBasReader.curRead, &simulatedCoordinate); reader.hdfBasReader.simulatedSequenceIndexArray.Read(reader.hdfBasReader.curRead-1, reader.hdfBasReader.curRead, &simulatedSequenceIndex); } if (printCcs == true) { if (printFastq == false) { seq.PrintSeq(fastaOut); } else { seq.PrintFastq(fastaOut, lineLength); } continue; } // // Determine the high quality boundaries of the read. This is // the full read is no hq regions exist, or it is stated to // ignore regions. // DNALength hqReadStart, hqReadEnd; int hqRegionScore; if (GetReadTrimCoordinates(seq, seq.zmwData, regionTable, hqReadStart, hqReadEnd, hqRegionScore) == false or (trimByRegion == false and maskByRegion == false)) { hqReadStart = 0; hqReadEnd = seq.length; } // // Mask off the low quality portions of the reads. // if (maskByRegion) { if (hqReadStart > 0) { fill(&seq.seq[0], &seq.seq[hqReadStart], 'N'); } if (hqReadEnd != seq.length) { fill(&seq.seq[hqReadEnd], &seq.seq[seq.length], 'N'); } } // // Now possibly print the full read with masking. This could be handled by making a // if (splitSubreads == false) { ReadInterval wholeRead(0, seq.length); // The set of subread intervals is just the entire read. subreadIntervals.clear(); subreadIntervals.push_back(wholeRead); } else { // // Print subread coordinates no matter whether or not reads have subreads. // subreadIntervals.clear(); // clear old, new intervals are appended. CollectSubreadIntervals(seq, ®ionTable, subreadIntervals); } // // Output all subreads as separate sequences. // int intvIndex; SMRTSequence bestSubreadSequence; int bestSubreadScore = -1; int bestSubreadIndex = 0; int bestSubreadStart = 0, bestSubreadEnd = 0; SMRTSequence bestSubread; for (intvIndex = 0; intvIndex < subreadIntervals.size(); intvIndex++) { SMRTSequence subreadSequence, subreadSequenceRC; subreadSequence.subreadStart = subreadIntervals[intvIndex].start; subreadSequence.subreadEnd = subreadIntervals[intvIndex].end; // // When trimming by region, only output the parts of the // subread that overlap the hq region. // if (trimByRegion == true) { subreadSequence.subreadStart = max((DNALength) subreadIntervals[intvIndex].start, hqReadStart); subreadSequence.subreadEnd = min((DNALength) subreadIntervals[intvIndex].end, hqReadEnd); } if (subreadSequence.subreadStart >= subreadSequence.subreadEnd or subreadSequence.subreadEnd - subreadSequence.subreadStart <= minSubreadLength) { // // There is no high qualty portion of this subread. Skip it. // continue; } if (hqRegionScore < minReadScore) { continue; } // // Print the subread, adding the coordinates as part of the title. // subreadSequence.ReferenceSubstring(seq, subreadSequence.subreadStart, subreadSequence.subreadEnd - subreadSequence.subreadStart); stringstream titleStream; titleStream << seq.title; if (splitSubreads) { // // Add the subread coordinates if splitting on subread. // titleStream << "/" << subreadSequence.subreadStart << "_" << subreadSequence.subreadEnd; } // // If running on simulated data, add where the values were simulated from. // if (addSimulatedData) { titleStream << ((FASTASequence*)&seq)->title << "/chrIndex_" << simulatedSequenceIndex << "/position_"<< simulatedCoordinate; ((FASTASequence*)&seq)->CopyTitle(titleStream.str()); } subreadSequence.CopyTitle(titleStream.str()); // // Eventually replace with WriterAgglomerate. // if (printOnlyBest == false) { if (subreadSequence.length > 0) { if (printFastq == false) { ((FASTASequence*)&subreadSequence)->PrintSeq(fastaOut); } else { subreadSequence.PrintFastq(fastaOut, lineLength); } } delete[] subreadSequence.title; } else { int subreadWeightedScore = subreadSequence.length * hqRegionScore; if (subreadWeightedScore > bestSubreadScore) { bestSubreadIndex = intvIndex; bestSubread = subreadSequence; bestSubreadScore = subreadWeightedScore; } } } if (printOnlyBest) { if (ccsSeq.length > 0) { if (printFastq == false) { ccsSeq.PrintSeq(fastaOut); } else { ccsSeq.PrintFastq(fastaOut, ccsSeq.length); } } else { if (bestSubreadScore >= 0) { if (printFastq == false) { bestSubread.PrintSeq(fastaOut); } else { bestSubread.PrintFastq(fastaOut, bestSubread.length); } bestSubread.Free(); } } ccsSeq.Free(); } seq.Free(); } reader.Close(); hdfRegionReader.Close(); } cerr << "[INFO] " << GetTimestamp() << " [" << program << "] ended." << endl; }
int main(int argc, char* argv[]) { string genomeFileName, readsFileName; TupleMetrics tm; float insRate = 0.10; tm.tupleSize = 8; CommandLineParser clp; int nProcessors = 1; clp.SetProgramName("exhalign"); clp.SetProgramSummary("Count the number of occurrences of every k-mer in a file."); clp.RegisterStringOption("genome", &genomeFileName, "The file of the genome to align to."); clp.RegisterStringOption("reads", &readsFileName, "The reads to align."); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterIntOption("wordsize", &tm.tupleSize, "Size of words to count", CommandLineParser::NonNegativeInteger); clp.RegisterFloatOption("insrate", &insRate, "Roughly the insertion rate (10%)", CommandLineParser::NonNegativeFloat); clp.RegisterIntOption("nProc", &nProcessors, "Number of processors to use", CommandLineParser::NonNegativeInteger); clp.ParseCommandLine(argc, argv); insRate+=1.0; // // Process the reads into a vector of read keywords // vector<string> readsFileNames; vector<FASTQSequence> reads; vector<vector<ReadKeyword> > keywords; SMRTSequence seq, seqRC; ReadKeyword keyword; int readIndex = 0; if (FileOfFileNames::IsFOFN(readsFileName)) { FileOfFileNames::FOFNToList(readsFileName, readsFileNames); } else { readsFileNames.push_back(readsFileName); } ReaderAgglomerate genomeReader; HDFRegionTableReader regionTableReader; genomeReader.Initialize(genomeFileName); FASTQSequence genome; genomeReader.GetNext(genome); SubreadIterator subreadIterator; keywords.resize(nProcessors); RegionTable regionTable, *regionTablePtr; int readsFileIndex; for (readsFileIndex = 0; readsFileIndex < readsFileNames.size(); readsFileIndex++ ) { ReaderAgglomerate reader; reader.Initialize(readsFileNames[readsFileIndex]); regionTalePtr = NULL; if (reader.fileType == HDFPulse or reader.fileType == HDFBase) { regionTableReader.Initialize(readsFileNames[readsFileIndex]); regionTableReader.Read(regionTable); regionTablePtr = ®ionTable; } else { regionTablePtr = NULL; } SMRTSequence fullSequence; while(reader.GetNext(fullSequence)) { subreadIterator.Initialize(&fullSequence, regionTablePtr); SMRTSequence seq; while (subreadIterator.GetNext(seq)) { DNALength pos; if (seq.length < tm.tupleSize) continue; reads.push_back(seq); for (pos = 0; pos < seq.length - tm.tupleSize + 1; pos++) { keyword.tuple.FromStringLR(&seq.seq[pos], tm); keyword.readPos = pos; keyword.readIndex = readIndex; keywords[(readIndex/2)%nProcessors].push_back(keyword); } readIndex++; seq.MakeRC(seqRC); reads.push_back(seqRC); for (pos = 0; pos < seqRC.length - tm.tupleSize + 1; pos++) { keyword.tuple.FromStringLR(&seqRC.seq[pos], tm); keyword.readPos = pos; keyword.readIndex = readIndex; keywords[(readIndex/2)%nProcessors].push_back(keyword); } readIndex++; // seq.Free(); seqRC.Free(); } fullSequence.Free(); } } int procIndex; for (procIndex = 0; procIndex < nProcessors; procIndex++) { std::sort(keywords[procIndex].begin(), keywords[procIndex].end()); } std::vector<int> prevAlignedGenomePos; std::vector<int> readOptScore; std::vector<FastqAlignment > optAlignment; std::vector<int> optGenomeAlignPos; std::vector<int> optGenomeAlignLength; prevAlignedGenomePos.resize(reads.size()); readOptScore.resize(reads.size()); optAlignment.resize(reads.size()); optGenomeAlignPos.resize(reads.size()); optGenomeAlignLength.resize(reads.size()); vector<Data> tdata; tdata.resize(nProcessors); std::fill(prevAlignedGenomePos.begin(), prevAlignedGenomePos.end(), -1); for (procIndex = 0; procIndex < nProcessors; procIndex++) { tdata[procIndex].prevAlignedGenomePos = &prevAlignedGenomePos; tdata[procIndex].readOptScore = &readOptScore; tdata[procIndex].optAlignment = &optAlignment; tdata[procIndex].optGenomeAlignPos = &optGenomeAlignPos; tdata[procIndex].optGenomeAlignLength = &optGenomeAlignLength; tdata[procIndex].keywords = &keywords[procIndex]; tdata[procIndex].genome = &genome; tdata[procIndex].insRate = insRate; tdata[procIndex].reads = &reads; tdata[procIndex].tm = &tm; } if (nProcessors == 1) { KeywordSeededAlignment(&tdata[0]); } else { pthread_t *threads = new pthread_t[nProcessors]; pthread_attr_t *threadAttr = new pthread_attr_t[nProcessors]; for (procIndex = 0; procIndex < nProcessors; procIndex++) { pthread_attr_init(&threadAttr[procIndex]); pthread_create(&threads[procIndex], &threadAttr[procIndex], (void*(*)(void*))KeywordSeededAlignment, &tdata[procIndex]); } for (procIndex = 0; procIndex < nProcessors; procIndex++) { pthread_join(threads[procIndex], NULL); } } VectorIndex i; // cout << "printing alignments for " << reads.size() << " reads." << endl; for (readIndex = 0; readIndex < readOptScore.size(); readIndex +=2 ){ int optIndex = readIndex; if (readOptScore[readIndex] > readOptScore[readIndex+1]) { optIndex= readIndex + 1; } FASTQSequence genomeSubstring; genomeSubstring.seq = &genome.seq[optGenomeAlignPos[optIndex]]; genomeSubstring.length = optGenomeAlignLength[optIndex]; if (prevAlignedGenomePos[optIndex] >= 0) { optAlignment[optIndex].qName.assign(reads[optIndex].title, reads[optIndex].titleLength); optAlignment[optIndex].tName.assign(genome.GetName()); ComputeAlignmentStats(optAlignment[optIndex], reads[optIndex].seq, genomeSubstring.seq, SMRTDistanceMatrix, 6, 6); if (optAlignment[optIndex].blocks.size() > 0) { PrintCompareSequencesAlignment(optAlignment[optIndex], reads[optIndex], genomeSubstring,cout); } /* StickPrintAlignment(optAlignment[optIndex], reads[optIndex], genomeSubstring, cout, 0, optGenomeAlignPos[optIndex]); */ } } for (readIndex = 0; readIndex < readOptScore.size(); readIndex++ ) { reads[readIndex].Free(); } return 0; }
int main(int argc, char* argv[]) { string inputFileName, outputFileName; if (argc < 2) { PrintUsage(); exit(0); } vector<string> inputFileNames; inputFileName = argv[1]; outputFileName = argv[2]; int argi = 3; RegionTable regionTable; string regionsFOFNName = ""; vector<string> regionFileNames; bool splitSubreads = true; bool useCCS = false; int minSubreadLength = 1; while (argi < argc) { if (strcmp(argv[argi], "-regionTable") == 0) { regionsFOFNName = argv[++argi]; } else if (strcmp(argv[argi], "-noSplitSubreads") == 0) { splitSubreads = false; } else if (strcmp(argv[argi], "-minSubreadLength") == 0) { minSubreadLength = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-useccsdenovo") == 0) { useCCS = true; } else { PrintUsage(); cout << "ERROR! Option " << argv[argi] << " is not supported." << endl; } argi++; } if (FileOfFileNames::IsFOFN(inputFileName)) { FileOfFileNames::FOFNToList(inputFileName, inputFileNames); } else { inputFileNames.push_back(inputFileName); } if (regionsFOFNName == "") { regionFileNames = inputFileNames; } else { if (FileOfFileNames::IsFOFN(regionsFOFNName)) { FileOfFileNames::FOFNToList(regionsFOFNName, regionFileNames); } else { regionFileNames.push_back(regionsFOFNName); } } ofstream fastaOut; CrucialOpen(outputFileName, fastaOut); int plsFileIndex; HDFRegionTableReader hdfRegionReader; AfgBasWriter afgWriter; afgWriter.Initialize(outputFileName); for (plsFileIndex = 0; plsFileIndex < inputFileNames.size(); plsFileIndex++) { if (splitSubreads) { hdfRegionReader.Initialize(regionFileNames[plsFileIndex]); hdfRegionReader.ReadTable(regionTable); regionTable.SortTableByHoleNumber(); } ReaderAgglomerate reader; // reader.SkipReadQuality(); // should have been taken care of by *Filter modules if (useCCS){ reader.UseCCS(); } else { reader.IgnoreCCS(); } reader.Initialize(inputFileNames[plsFileIndex]); CCSSequence seq; int seqIndex = 0; int numRecords = 0; vector<ReadInterval> subreadIntervals; while (reader.GetNext(seq)){ ++seqIndex; if (splitSubreads == false) { if (seq.length >= minSubreadLength) { afgWriter.Write(seq); } seq.Free(); continue; } DNALength hqReadStart, hqReadEnd; int score; GetReadTrimCoordinates(seq, seq.zmwData, regionTable, hqReadStart, hqReadEnd, score); subreadIntervals.clear(); // clear old, new intervals are appended. CollectSubreadIntervals(seq,®ionTable, subreadIntervals); if (seq.length == 0 and subreadIntervals.size() > 0) { cout << "WARNING! A high quality interval region exists for a read of length 0." <<endl; cout << " The offending ZMW number is " << seq.zmwData.holeNumber << endl; seq.Free(); continue; } for (int intvIndex = 0; intvIndex < subreadIntervals.size(); intvIndex++) { SMRTSequence subreadSequence; int subreadStart = subreadIntervals[intvIndex].start > hqReadStart ? subreadIntervals[intvIndex].start : hqReadStart; int subreadEnd = subreadIntervals[intvIndex].end < hqReadEnd ? subreadIntervals[intvIndex].end : hqReadEnd; int subreadLength = subreadEnd - subreadStart; if (subreadLength < minSubreadLength) continue; subreadSequence.subreadStart = subreadStart; subreadSequence.subreadEnd = subreadEnd; subreadSequence.ReferenceSubstring(seq, subreadStart, subreadLength); stringstream titleStream; titleStream << seq.title << "/" << subreadIntervals[intvIndex].start << "_" << subreadIntervals[intvIndex].end; subreadSequence.CopyTitle(titleStream.str()); afgWriter.Write(subreadSequence); delete[] subreadSequence.title; } seq.Free(); } reader.Close(); hdfRegionReader.Close(); } }