int main(int argc, char* argv[]) { std::string seqInName, seqOutName, dotOutName; if (argc < 4) { std::cout << "usage: exciseRepeats inName repMaskOutFile outName" << std::endl; std::exit(EXIT_FAILURE); } seqInName = argv[1]; dotOutName = argv[2]; seqOutName = argv[3]; FASTAReader reader; reader.Initialize(seqInName); FASTASequence origSeq; reader.GetNext(origSeq); std::ifstream dotOutFile; CrucialOpen(dotOutName, dotOutFile); std::ofstream seqOutFile; std::ofstream seqOut; CrucialOpen(seqOutName, seqOut, std::ios::out); std::string dotOutLine; getline(dotOutFile, dotOutLine); getline(dotOutFile, dotOutLine); getline(dotOutFile, dotOutLine); while (getline(dotOutFile, dotOutLine)) { std::stringstream lineStrm(dotOutLine); int swScore; float pctDiv, pctDel, pctIns; std::string query; DNALength qPosBegin, qPosEnd; std::string left; char strand; std::string matchingRepeat; std::string repClass; std::string repPos, repEnd, repLeft; int id; lineStrm >> swScore >> pctDiv >> pctDel >> pctIns >> query >> qPosBegin >> qPosEnd >> left >> strand >> matchingRepeat >> repClass >> repPos >> repEnd >> repLeft >> id; for (DNALength seqPos = qPosBegin; seqPos < qPosEnd; seqPos++) { origSeq.seq[seqPos] = 'X'; } } DNALength seqPos, unexPos; unexPos = 0; for (seqPos = 0; seqPos < origSeq.length; seqPos++) { if (origSeq.seq[seqPos] != 'X') { origSeq.seq[unexPos] = origSeq.seq[seqPos]; unexPos++; } } origSeq.length = unexPos; origSeq.PrintSeq(seqOut); return 0; }
int main(int argc, char* argv[]) { string alignmentFileName, geneDBFileName; if (argc < 3) { cout << "usage: printDuplicates alignemntFile genedb " << endl; exit(1); } alignmentFileName = argv[1]; geneDBFileName = argv[2]; GeneDB genedb; genedb.Read(geneDBFileName); genedb.IndexChromosomes(); ifstream alignmentIn; CrucialOpen(alignmentFileName, alignmentIn, std::ios::in); while(alignmentIn) { string line; getline(alignmentIn, line); if (line == "###") { // found the end of an entry } else { string chrName, genome, source; int start, end, identity; char a, strand, b; string annotationString; stringstream strm(line); strm >> chrName >> genome >> source >> start >> end >> a >> strand >> b >> annotationString; vector<string> annotations; if (source != "exon") { continue; } ParseSeparatedList(annotationString, annotations, ';'); GeneDBChromosome *chromosomePtr; chromosomePtr = genedb.Find(chrName); if (chromosomePtr == NULL) { cout << "chromosome " << chromosomePtr << " not found in database." << endl; continue; } int exonIndex; if ( chromosomePtr->LookupIndexByStart(start, exonIndex) ) { if (chromosomePtr->exons[exonIndex].start <= start and chromosomePtr->exons[exonIndex].end >= end) { chromosomePtr->exons[exonIndex].Print(cout); } } else { } } } }
/// Selects the input stream the reader will consume.
///
/// @param samFileName Path of the SAM file, or the literal "stdin" to read
///                    from standard input.
/// @return Always true; opening a named file goes through CrucialOpen.
bool SAMReader<T_ReferenceSequence, T_ReadGroup, T_SAMAlignment>::Initialize(std::string samFileName)
{
    // The special name "stdin" bypasses the file stream entirely.
    if (samFileName == "stdin") {
        samFilePtr = &std::cin;
        return true;
    }

    CrucialOpen(samFileName, samFile, std::ios::in);
    samFilePtr = &samFile;
    return true;
}
/// Appends every non-empty line of a file-of-file-names to a list.
///
/// @param fofnFileName Path of the file whose lines are the names.
/// @param fofnList     Receives one entry per non-empty line, in file order;
///                     existing contents are kept.
void FileOfFileNames::FOFNToList(std::string &fofnFileName, std::vector<std::string> &fofnList)
{
    std::ifstream fofnIn;
    CrucialOpen(fofnFileName, fofnIn);

    std::string entry;
    while (std::getline(fofnIn, entry)) {
        if (!entry.empty()) {
            fofnList.push_back(entry);
        }
    }
}
int main(int argc, char *argv[]) { string sequencesInName, sequencesOutName; if (argc <3){ cout << "usage: scramble in out" << endl; exit(1); } sequencesInName = argv[1]; sequencesOutName= argv[2]; vector<FASTASequence*> sequences; vector<int> sequenceIndices; FASTAReader reader; reader.Init(sequencesInName); ofstream out; CrucialOpen(sequencesOutName, out, std::ios::out); FASTASequence read; FASTASequence*readPtr; while(reader.GetNext(read)) { readPtr = new FASTASequence; *readPtr = read; sequences.push_back(readPtr); } int i; for (i = 0; i < sequences.size(); i++) { sequenceIndices.push_back(i); } for (i = 0; i < 10*sequences.size(); i++ ){ // // shuffle indices. // int idx1; int idx2; idx1 = RandomInt(sequences.size()); idx2 = RandomInt(sequences.size()); int tmp; tmp = sequenceIndices[idx1]; sequenceIndices[idx1] = sequenceIndices[idx2]; sequenceIndices[idx2] = tmp; } for (i = 0; i < sequenceIndices.size(); i++ ){ sequences[sequenceIndices[i]]->PrintSeq(out); } return 0; }
/// Serializes this compressed sequence to a binary file.
///
/// Layout, in write order: hasTitle and hasIndex (sizeof(int) bytes each);
/// if hasTitle, titleLength (sizeof(int)) followed by titleLength title bytes;
/// length (sizeof(int)); `length` sequence bytes; and, if hasIndex, the index
/// as written by index.Write().
///
/// @param outFileName Path of the file to create or overwrite.
void CompressedSequence<T_Sequence>::Write(std::string outFileName)
{
    std::ofstream out;
    // Open for output. The previous mode (binary | in) made the open fail for
    // files that do not exist yet and never truncated existing ones.
    CrucialOpen(outFileName, out, std::ios::binary | std::ios::out);
    out.write((char*) &hasTitle, sizeof(int));
    out.write((char*) &hasIndex, sizeof(int));
    if (hasTitle) {
        out.write((char*)&titleLength, sizeof(int));
        out.write((char*)title, titleLength);
    }
    out.write((char*) &length, sizeof(int));
    out.write((char*) seq, sizeof(char) * length);
    if (hasIndex) {
        index.Write(out);
    }
    out.close();
}
int Read(std::string inName) { std::ifstream bwtIn; CrucialOpen(inName, bwtIn, std::ios::binary|std::ios::in); bwtSequence.Read(bwtIn);*)charCount, sizeof(DNALength)*CharCountSize);*)&firstCharPos, sizeof(DNALength));*)&useDebugData, sizeof(useDebugData)); if (useDebugData) { saCopy.resize(bwtSequence.length-1);*)&saCopy[0], (bwtSequence.length-1) * sizeof(DNALength)); } occ.Read(bwtIn, useDebugData); pos.Read(bwtIn); occ.InitializeBWT(bwtSequence); return 1; }
void GFFFile::ReadAll(std::string & gffFileName) { std::fstream gffIn; CrucialOpen(gffFileName, gffIn, std::ios::in); while(gffIn) { std::string line; getline(gffIn, line); std::stringstream linestrm(line); std::string name, source, type; UInt start, end; char strand; float score; std::string frame, attributes; // A sample record in adapterGffFile: // ref000001 . adapter 10955 10999 0.00 + . xxxx linestrm >> name >> source >> type >> start >> end >> score >> strand >> frame >> attributes; entries.push_back(GFFEntry( name, source, type, start, end, score, strand, frame, attributes)); } gffIn.close(); }
/// Loads a compressed sequence from a binary file, replacing the current
/// contents of this object.
///
/// Read order: hasTitle and hasIndex (sizeof(int) bytes each); if hasTitle, the
/// title length (sizeof(int)) and that many title bytes; length
/// (sizeof(DNALength)); `length` Nucleotide values; and, if hasIndex, the
/// index via index.Read().
///
/// @param inFileName Path of the file to read.
void CompressedSequence<T_Sequence>::Read(std::string inFileName)
{
    Free();  //Free before reusing this object.
    std::ifstream in;
    CrucialOpen(inFileName, in, std::ios::binary | std::ios::in);

    // NOTE(review): the raw reads in this function were restored from damaged
    // text in which the leading "in.read((char" had been stripped -- confirm
    // against version control.

    // step 1, read in the options.
    in.read((char*) &hasTitle, sizeof(int));
    in.read((char*) &hasIndex, sizeof(int));
    if (hasTitle) {
        int inTitleLength;
        in.read((char*) &inTitleLength, sizeof(int));
        char * inTitle = ProtectedNew<char>(inTitleLength+1);
        in.read((char*) inTitle, inTitleLength);
        // Terminate at the length just read; the member titleLength does not
        // describe this temporary buffer.
        inTitle[inTitleLength] = '\0';
        CopyTitle(inTitle, inTitleLength);
        delete [] inTitle;
    }
    in.read((char*) &length, sizeof(DNALength));
    seq = ProtectedNew<Nucleotide>(length);
    in.read((char*) seq, length * sizeof(Nucleotide));
    if (hasIndex) {
        index.Read(in);
    }
    // The sequence buffer allocated above is flagged as owned by this object.
    deleteOnExit = true;
}
int main(int argc, char* argv[]) { CommandLineParser clp; string indexDBName; bool printIndex = false; int searchIndex; // // Configure the command line. // clp.SetProgramName("testseqdb"); clp.SetProgramSummary("test the sequence db.\n"); clp.RegisterStringOption("indexdb", &indexDBName, "The index to test."); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterFlagOption("p", &printIndex, "Print the start position of each read."); clp.RegisterIntOption("i", &searchIndex, "The index to search for", CommandLineParser::NonNegativeInteger, true); clp.ParseCommandLine(argc, argv); SequenceIndexDatabase<FASTASequence> seqDB; ifstream in; CrucialOpen(indexDBName, in, std::ios::in | std::ios::binary); seqDB.ReadDatabase(in); if (printIndex) { int i; for (i = 0; i < seqDB.nSeqPos - 1; i++) { cout << i << " " << seqDB.seqStartPos[i+1] << " " << seqDB.names[i] << endl; } } int dbPos = seqDB.SearchForIndex(searchIndex); if (dbPos >= 0) { cout << "searchIndex: " << searchIndex << " " << dbPos << " " << seqDB.seqStartPos[dbPos] << " " << seqDB.names[dbPos-1] << endl; } };
int main(int argc, char* argv[]) { std::string outFileName; unsigned contextLength = 5; int minSamples = 500; int maxSamples = 1000; if (argc < 3) { PrintUsage(); std::exit(EXIT_FAILURE); } int argi = 1; std::string cmpH5FileName; cmpH5FileName = argv[argi++]; outFileName = argv[argi++]; int minAverageQual = 0; bool onlyMaxLength = false; while (argi < argc) { if (strcmp(argv[argi], "-contextLength") == 0) { contextLength = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-minSamples") == 0) { minSamples = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-maxSamples") == 0) { maxSamples = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-onlyMaxLength") == 0) { onlyMaxLength = true; } else { PrintUsage(); std::cout << "ERROR, bad option: " << argv[argi] << std::endl; std::exit(EXIT_FAILURE); } ++argi; } std::map<std::string, ScoredLength> maxLengthMap; OutputSampleListSet samples(contextLength); SMRTSequence read; std::ofstream sampleOut; CrucialOpen(outFileName, sampleOut, std::ios::out | std::ios::binary); int fileNameIndex; int numContextsReached = 0; int numContexts = 1 << (contextLength * 2); ReaderAgglomerate reader; samples.keyLength = contextLength; HDFCmpFile<CmpAlignment> cmpReader; cmpReader.IncludeField("QualityValue"); cmpReader.IncludeField("DeletionQV"); cmpReader.IncludeField("InsertionQV"); cmpReader.IncludeField("SubstitutionQV"); cmpReader.IncludeField("SubstitutionTag"); cmpReader.IncludeField("DeletionTag"); cmpReader.IncludeField("PulseIndex"); cmpReader.IncludeField("WidthInFrames"); cmpReader.IncludeField("PreBaseFrames"); if (cmpReader.Initialize(cmpH5FileName, H5F_ACC_RDWR) == 0) { std::cout << "ERROR, could not open the cmp file." << std::endl; std::exit(EXIT_FAILURE); } std::cout << "Reading cmp file." << std::endl; CmpFile cmpFile; cmpReader.ReadAlignmentDescriptions(cmpFile); cmpReader.ReadStructure(cmpFile); std::cout << "done reading structure." 
<< std::endl; int alignmentIndex; int nAlignments = cmpReader.alnInfoGroup.GetNAlignments(); std::vector<int> alignmentToBaseMap; for (alignmentIndex = 0; alignmentIndex < nAlignments and !samples.Sufficient(); alignmentIndex++) { // // For ease of use, store the length of the alignment to make another model. // ByteAlignment alignmentArray; cmpReader.ReadAlignmentArray(alignmentIndex, alignmentArray); Alignment alignment; ByteAlignmentToAlignment(alignmentArray, alignment); std::string readSequence, refSequence; readSequence.resize(alignmentArray.size()); refSequence.resize(alignmentArray.size()); DNASequence readDNA, refDNA; ByteAlignmentToQueryString(&alignmentArray[0], alignmentArray.size(), &readSequence[0]); ByteAlignmentToRefString(&alignmentArray[0], alignmentArray.size(), &refSequence[0]); RemoveGaps(readSequence, readSequence); RemoveGaps(refSequence, refSequence); readDNA.seq = (Nucleotide*)readSequence.c_str(); readDNA.length = readSequence.size(); refDNA.seq = (Nucleotide*)refSequence.c_str(); refDNA.length = refSequence.size(); CmpAlignment cmpAlignment; cmpReader.ImportReadFromCmpH5(alignmentIndex, cmpAlignment, read); CreateAlignmentToSequenceMap(alignmentArray, alignmentToBaseMap); if (read.length < contextLength) { continue; } int subreadLength = (cmpFile.alnInfo.alignments[alignmentIndex].GetQueryEnd() - cmpFile.alnInfo.alignments[alignmentIndex].GetQueryStart()); if (onlyMaxLength == false) { samples.lengths.push_back(subreadLength); } else { int score = (cmpAlignment.GetNMatch() - cmpAlignment.GetNMismatch() - cmpAlignment.GetNInsertions() - cmpAlignment.GetNDeletions()); std::stringstream nameStrm; nameStrm << cmpAlignment.GetMovieId() << "_" << cmpAlignment.GetHoleNumber(); std::string nameStr = nameStrm.str(); if (maxLengthMap.find(nameStr) == maxLengthMap.end()) { maxLengthMap[nameStr] = ScoredLength(score, subreadLength); } } int sampleEnd = alignmentArray.size() - contextLength / 2; int a; for (a = contextLength / 2; a < sampleEnd; a++) { 
// Make sure the context begins on a real nucleotide. while (a < sampleEnd and ((RefChar[alignmentArray[a]] == ' '))) { a++; } // // Move ab back to an index where there are contextLength/2 non-gap // characters, counted by nb // int ab; //num bases int ae; //alignment end ab = a - 1; int nb = 0, ne = 0; while (true) { if (RefChar[alignmentArray[ab]] != ' ') { nb++; } if (ab == 0 or nb == static_cast<int>(contextLength) / 2) break; ab--; } // // Advance ae to an index where there are contextLength/2 non-gap // characters, counted by ne. // ae = a + 1; while (ae < static_cast<int>(alignmentArray.size()) and ne < static_cast<int>(contextLength) / 2) { if (RefChar[alignmentArray[ae]] != ' ') { ne++; } ae++; } // // Make sure there are no edge effects that prevent a context of the correct length from being assigned. // if (nb + ne + 1 != static_cast<int>(contextLength)) { continue; } int ai; std::string context; for (ai = ab; ai < ae; ai++) { if (RefChar[alignmentArray[ai]] != ' ') { context.push_back(RefChar[alignmentArray[ai]]); } } assert(context.size() == contextLength); // // Now create the context. // OutputSample sample; // // This context is a deletion, create that. // sample.type = OutputSample::Deletion; // // This context is either an insertion or substitution // // Look to see if the previous aligned position was an // insertion, and move back as far as the insertion extends. 
int aq = a - 1; int sampleLength; if (QueryChar[alignmentArray[a]] == ' ') { sample.type = OutputSample::Deletion; sampleLength = 0; } else if (RefChar[alignmentArray[aq]] == ' ') { while (aq > 0 and RefChar[alignmentArray[aq]] == ' ' and QueryChar[alignmentArray[aq]] != ' ') { aq--; } sample.type = OutputSample::Insertion; sampleLength = a - aq; } else if (QueryChar[alignmentArray[a]] == RefChar[alignmentArray[aq]]) { sample.type = OutputSample::Match; sampleLength = 1; } else { sample.type = OutputSample::Substitution; sampleLength = 1; } sample.Resize(sampleLength); if (sampleLength > 0) { int seqPos = alignmentToBaseMap[aq]; if (seqPos < static_cast<int>(read.length)) { sample.CopyFromSeq(read, seqPos, sampleLength); std::string nucs; for (size_t n = 0; n < sample.nucleotides.size(); n++) { char c = sample.nucleotides[n]; assert(c == 'A' or c == 'T' or c == 'G' or c == 'C'); nucs.push_back(sample.nucleotides[n]); } } } samples.AppendOutputSample(context, sample); } read.Free(); } if (onlyMaxLength) { std::map<std::string, ScoredLength>::iterator maxScoreIt; for (maxScoreIt = maxLengthMap.begin(); maxScoreIt != maxLengthMap.end(); ++maxScoreIt) { std::cout << maxScoreIt->second.length << std::endl; samples.lengths.push_back(maxScoreIt->second.length); } } samples.Write(sampleOut); return 0; }
void Write(std::string outName) { std::ofstream bwtOut; CrucialOpen(outName, bwtOut, std::ios::binary|std::ios::out); Write(bwtOut); }
int main(int argc, char* argv[]) { string program = "samtom4"; string versionString = VERSION; AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString); string samFileName, refFileName, outFileName; bool printHeader = false; bool parseSmrtTitle = false; bool useShortRefName = false; CommandLineParser clp; clp.SetProgramName(program); clp.SetVersion(versionString); clp.SetProgramSummary("Converts a SAM file generated by blasr to M4 format."); clp.RegisterStringOption("in.sam", &samFileName, "Input SAM file, which is produced by blasr."); clp.RegisterStringOption("reference.fasta", &refFileName, "Reference used to generate file.sam."); clp.RegisterStringOption("out.m4", &outFileName, "Output in blasr M4 format."); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterFlagOption("header", &printHeader, "Print M4 header."); clp.RegisterFlagOption("useShortRefName", &useShortRefName, "Use abbreviated reference names obtained " "from file.sam instead of using full names " "from reference.fasta."); //clp.SetExamples(program + " file.sam reference.fasta out.m4"); clp.ParseCommandLine(argc, argv); ostream * outFilePtr = &cout; ofstream outFileStrm; if (outFileName != "") { CrucialOpen(outFileName, outFileStrm, std::ios::out); outFilePtr = &outFileStrm; } SAMReader<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> samReader; FASTAReader fastaReader; // // Initialize samReader and fastaReader. // samReader.Initialize(samFileName); fastaReader.Initialize(refFileName); // // Configure the file log. // string command; CommandLineParser::CommandLineToString(argc, argv, command); // // Read necessary input. // vector<FASTASequence> references; fastaReader.ReadAllSequences(references); AlignmentSet<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> alignmentSet; samReader.ReadHeader(alignmentSet); // // The order of references in vector<FASTASequence> references and // AlignmentSet<, , >alignmentSet.references can be different. 
// Rearrange alignmentSet.references such that it is ordered in // exactly the same way as vector<FASTASequence> references. // alignmentSet.RearrangeReferences(references); // // Map short names for references obtained from file.sam to // full names obtained from reference.fasta // map<string, string> shortRefNameToFull; map<string, string>::iterator it; assert(references.size() == alignmentSet.references.size()); if (!useShortRefName) { for (size_t i = 0; i < references.size(); i++) { string shortRefName = alignmentSet.references[i].GetSequenceName(); string fullRefName(references[i].title); if (shortRefNameToFull.find(shortRefName) != shortRefNameToFull.end()) { cout << "ERROR, Found more than one reference " << shortRefName << "in sam header" << endl; exit(1); } shortRefNameToFull[shortRefName] = fullRefName; alignmentSet.references[i].sequenceName = fullRefName; } } // Map reference name obtained from SAM file to indices map<string, int> refNameToIndex; for (size_t i = 0; i < references.size(); i++) { string refName = alignmentSet.references[i].GetSequenceName(); refNameToIndex[refName] = i; } // // Store the alignments. // SAMAlignment samAlignment; size_t alignIndex = 0; // // For 150K, each chip produces about 300M sequences // (not including quality values and etc.). // Let's assume that the sam file and reference data can // fit in the memory. // Need to scale for larger sequal data in the future. // if (printHeader) IntervalOutput::PrintHeader(*outFilePtr); // The socre matrix does not matter because we will use the // aligner's score from SAM file anyway. DistanceMatrixScoreFunction<DNASequence, DNASequence> distScoreFn; while (samReader.GetNextAlignment(samAlignment)) { if (samAlignment.rName == "*") { continue; } if (!useShortRefName) { //convert shortRefName to fullRefName it = shortRefNameToFull.find(samAlignment.rName); if (it == shortRefNameToFull.end()) { cout << "ERROR, Could not find " << samAlignment.rName << " in the reference repository." 
<< endl; exit(1); } samAlignment.rName = (*it).second; } // The padding character 'P' is not supported if (samAlignment.cigar.find('P') != string::npos) { cout << "WARNING. Could not process sam record with 'P' in its cigar string." << endl; continue; } vector<AlignmentCandidate<> > convertedAlignments; // // Keep reference as forward. // So if IsReverseComplement(sam.flag)==true, then qStrand is reverse // and tStrand is forward. // bool keepRefAsForward = false; SAMAlignmentsToCandidates(samAlignment, references, refNameToIndex, convertedAlignments, parseSmrtTitle, keepRefAsForward); if (convertedAlignments.size() > 1) { cout << "WARNING. Ignore an alignment which has multiple segments." << endl; continue; } //all alignments are unique single-ended alignments. for (int i = 0; i < 1; i++) { AlignmentCandidate<> & alignment = convertedAlignments[i]; ComputeAlignmentStats(alignment, alignment.qAlignedSeq.seq, alignment.tAlignedSeq.seq, distScoreFn); // Use aligner's score from SAM file anyway. alignment.score =; alignment.mapQV = samAlignment.mapQV; // Since SAM only has the aligned sequence, many info of the // original query (e.g. the full length) is missing. // Overwrite alignment.qLength (which is length of the query // in the SAM alignment) with xq (which is the length of the // original query sequence saved by blasr) right before printing // the output so that one can reconstruct a blasr m4 record from // a blasr sam alignment. if (samAlignment.xq!=0) alignment.qLength = samAlignment.xq; IntervalOutput::PrintFromSAM(alignment, *outFilePtr); alignment.FreeSubsequences(); } ++alignIndex; } if (outFileName != "") { outFileStrm.close(); } return 0; }
int main(int argc, char* argv[]) { std::string inFileName, readsFileName; DNALength readLength; float coverage = 0; bool noRandInit = false; int numReads = -1; CommandLineParser clp; int qualityValue = 20; bool printFastq = false; int stratify = 0; std::string titleType = "pacbio"; std::string fastqType = "illumina"; // or "sanger" clp.RegisterStringOption("inFile", &inFileName, "Reference sequence", 0); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterIntOption("readLength", (int*)&readLength, "The length of reads to simulate. The length is fixed.", CommandLineParser::PositiveInteger, 0); clp.RegisterFloatOption("coverage", &coverage, "Total coverage (from which the number of reads is calculated", CommandLineParser::PositiveFloat, 0); clp.RegisterFlagOption("nonRandInit", &noRandInit, "Skip initializing the random number generator with time."); clp.RegisterIntOption("nReads", &numReads, "Total number of reads (from which coverage is calculated)", CommandLineParser::PositiveInteger, 0); clp.RegisterStringOption("readsFile", &readsFileName, "Reads output file", 0); clp.RegisterFlagOption("fastq", &printFastq, "Fake fastq output with constant quality value (20)"); clp.RegisterIntOption("quality", &qualityValue, "Value to use for fastq quality", CommandLineParser::PositiveInteger); clp.RegisterIntOption("stratify", &stratify, "Sample a read every 'stratify' bases, rather than randomly.", CommandLineParser::PositiveInteger); clp.RegisterStringOption("titleType", &titleType, "Set the name of the title: 'pacbio'|'illumina'"); clp.RegisterStringOption("fastqType", &fastqType, "Set the type of fastq: 'illumina'|'sanger'"); std::vector<std::string> leftovers; clp.ParseCommandLine(argc, argv, leftovers); if (!noRandInit) { InitializeRandomGeneratorWithTime(); } FASTAReader inReader; inReader.Init(inFileName); std::vector<FASTASequence> reference; inReader.ReadAllSequences(reference); std::ofstream readsFile; if (readsFileName == "") { std::cout << "ERROR. 
You must specify a reads file." << std::endl; std::exit(EXIT_FAILURE); } CrucialOpen(readsFileName, readsFile, std::ios::out); std::ofstream sangerFastqFile; if (fastqType == "sanger") { std::string sangerFastqFileName = readsFileName + ".fastq"; CrucialOpen(sangerFastqFileName, sangerFastqFile, std::ios::out); } DNALength refLength = 0; for (size_t i = 0; i < reference.size(); i++) { refLength += reference[i].length; } if (numReads == -1 and coverage == 0 and stratify == 0) { std::cout << "ERROR, you must specify either coverage, nReads, or stratify." << std::endl; std::exit(EXIT_FAILURE); } else if (numReads == -1) { numReads = (refLength / readLength) * coverage; } if (stratify) { if (!readLength) { std::cout << "ERROR. If you are using stratification, a read length must be specified." << std::endl; std::exit(EXIT_FAILURE); } } DNASequence sampleSeq; sampleSeq.length = readLength; int maxRetry = 10000000; int retryNumber = 0; DNALength seqIndex, seqPos; if (stratify) { seqIndex = 0; seqPos = 0; } DNALength origReadLength = readLength; for (int i = 0; stratify or i < numReads; i++) { if (stratify == 0) { FindRandomPos(reference, seqIndex, seqPos, readLength); } else { // // find the next start pos, or bail if done // if (seqPos >= reference[seqIndex].length) { if (seqIndex == reference.size() - 1) { break; } else { seqIndex = seqIndex + 1; seqPos = 0; continue; } } readLength = std::min(reference[seqIndex].length - seqPos, origReadLength); } sampleSeq.seq = &reference[seqIndex].seq[seqPos]; int j; int gappedRead = 0; std::string title; std::stringstream titleStrm; if (titleType == "pacbio") { titleStrm << i << "|" << reference[seqIndex].GetName() << "|" << seqPos << "|" << seqPos + readLength; } else if (titleType == "illumina") { titleStrm << "SE_" << i << "_0@" << seqPos << "-" << seqPos + readLength << "/1"; } else { std::cout << "ERROR. 
Bad title type " << titleType << std::endl; std::exit(EXIT_FAILURE); } title = titleStrm.str(); sampleSeq.length = readLength; if (!printFastq) { readsFile << ">" << title << std::endl; sampleSeq.PrintSeq(readsFile); } else { FASTQSequence fastqSampleSeq; fastqSampleSeq.CopyTitle(title); fastqSampleSeq.seq = sampleSeq.seq; fastqSampleSeq.length = sampleSeq.length; = new unsigned char[sampleSeq.length]; std::fill(, + sampleSeq.length, qualityValue); if (fastqType == "illumina") { fastqSampleSeq.PrintFastq(readsFile, fastqSampleSeq.length + 1); } else { fastqSampleSeq.PrintSeq(readsFile); fastqSampleSeq.PrintQual(sangerFastqFile); } delete[]; delete[] fastqSampleSeq.title; } if (stratify) { seqPos += readLength; } } return 0; }
void AfgBasWriter::Initialize(std::string _afgFileName){ afgFileName = _afgFileName; CrucialOpen(afgFileName, afgOut); }
int main(int argc, char* argv[]) { string barcodeFileName, insertFileName, outputFileName; if (argc != 4) { cout << "usage: makeBarcodeDatabase insert.fasta barcodes.fasta output.fasta" << endl; exit(1); } insertFileName = argv[1]; barcodeFileName = argv[2]; outputFileName = argv[3]; FASTAReader barcodeReader, insertReader; barcodeReader.Initialize(barcodeFileName); insertReader.Initialize(insertFileName); ofstream barcodedOut; CrucialOpen(outputFileName, barcodedOut, std::ios::out); vector<FASTASequence> forwardBarcodes, reverseBarcodes; FASTASequence barcodeSequence, reverseBarcodeSequence; while(barcodeReader.GetNext(barcodeSequence)) { forwardBarcodes.push_back(barcodeSequence); barcodeSequence.MakeRC(reverseBarcodeSequence); reverseBarcodes.push_back(reverseBarcodeSequence); } FASTASequence insert; insertReader.GetNext(insert); int i; for (i = 0; i < forwardBarcodes.size(); i++) { FASTASequence barcodedInsert; barcodedInsert.Resize(forwardBarcodes[i].length * 2 + insert.length); stringstream titleStrm; titleStrm << insert.title << "|ff|" << forwardBarcodes[i].title; barcodedInsert.CopyTitle(titleStrm.str()); memcpy(&barcodedInsert.seq[0], &forwardBarcodes[i].seq[0], forwardBarcodes[i].length); memcpy(&barcodedInsert.seq[forwardBarcodes[i].length], insert.seq, insert.length); memcpy(&barcodedInsert.seq[forwardBarcodes[i].length + insert.length], forwardBarcodes[i].seq, forwardBarcodes[i].length); barcodedInsert.PrintSeq(barcodedOut); titleStrm.str(""); titleStrm << insert.title << "|fr|" << forwardBarcodes[i].title; barcodedInsert.CopyTitle(titleStrm.str()); memcpy(&barcodedInsert.seq[0], &forwardBarcodes[i].seq[0], forwardBarcodes[i].length); memcpy(&barcodedInsert.seq[forwardBarcodes[i].length], insert.seq, insert.length); memcpy(&barcodedInsert.seq[forwardBarcodes[i].length + insert.length], reverseBarcodes[i].seq, reverseBarcodes[i].length); barcodedInsert.PrintSeq(barcodedOut); titleStrm.str(""); titleStrm << insert.title << "|rf|" << 
forwardBarcodes[i].title; barcodedInsert.CopyTitle(titleStrm.str()); memcpy(&barcodedInsert.seq[0], &reverseBarcodes[i].seq[0], reverseBarcodes[i].length); memcpy(&barcodedInsert.seq[reverseBarcodes[i].length], insert.seq, insert.length); memcpy(&barcodedInsert.seq[reverseBarcodes[i].length + insert.length], forwardBarcodes[i].seq, forwardBarcodes[i].length); barcodedInsert.PrintSeq(barcodedOut); titleStrm.str(""); titleStrm << insert.title << "|rr|" << forwardBarcodes[i].title; barcodedInsert.CopyTitle(titleStrm.str()); memcpy(&barcodedInsert.seq[0], &reverseBarcodes[i].seq[0], reverseBarcodes[i].length); memcpy(&barcodedInsert.seq[reverseBarcodes[i].length], insert.seq, insert.length); memcpy(&barcodedInsert.seq[reverseBarcodes[i].length + insert.length], reverseBarcodes[i].seq, reverseBarcodes[i].length); barcodedInsert.PrintSeq(barcodedOut); } }
int main(int argc, char* argv[]) { #ifdef USE_GOOGLE_PROFILER char *profileFileName = getenv("CPUPROFILE"); if (profileFileName != NULL) { ProfilerStart(profileFileName); } else { ProfilerStart("google_profile.txt"); } #endif // Register inputs and outputs. string samFileName, refFileName, outFileName; CommandLineParser clp; clp.RegisterStringOption("file.sam", &samFileName, "Input SAM file."); clp.RegisterStringOption("reference.fasta", &refFileName, "Reference used to generate reads."); clp.RegisterStringOption("out.sam", &outFileName, "Output SAM file."); clp.RegisterPreviousFlagsAsHidden(); // Register filter criteria options. int minAlnLength = 50; float minPctSimilarity = 70, minPctAccuracy = 70; string hitPolicyStr = "randombest"; bool useScoreCutoff = false; int scoreCutoff = INF_INT; int scoreSignInt = -1; RegisterFilterOptions(clp, minAlnLength, minPctSimilarity, minPctAccuracy, hitPolicyStr, useScoreCutoff, scoreSignInt, scoreCutoff); int seed = 1; clp.RegisterIntOption("seed", &seed, "(1) Seed for random number generator.\n" "If seed is 0, then use current time as seed.", CommandLineParser::Integer); string holeNumberStr; Ranges holeNumberRanges; clp.RegisterStringOption("holeNumbers", &holeNumberStr, "A string of comma-delimited hole number ranges to output hits, " "such as '1,2,10-12'. " "This requires hit titles to be in SMRT read title format."); bool parseSmrtTitle = false; clp.RegisterFlagOption("smrtTitle", &parseSmrtTitle, "Use this option when filtering alignments generated by " "programs other than blasr, e.g. bwa-sw or gmap. " " Parse read coordinates from the SMRT read title. " "The title is in the format /name/hole/coordinates, where" " coordinates are in the format \\d+_\\d+, and represent " "the interval of the read that was aligned."); /* This experimental option can be useful for metagenomics, in which case * there are hundreds of sequences in the target, of which many titles are * long and may contain white spaces (e.g., ' ', '\t'). 
* In order to save disc space and avoid the (possibly) none unique mapping * between full and short reference names, one may call blasr with * -titleTable option to represent all target sequences in the output * by their indices in the title table.*/ string titleTableName = ""; clp.RegisterStringOption("titleTable", &titleTableName, "Use this experimental option when filtering alignments generated by " "blasr with -titleTable titleTableName, in which case " "reference titles in SAM are represented by their " "indices (e.g., 0, 1, 2, ...) in the title table."); string adapterGffFileName = ""; clp.RegisterStringOption("filterAdapterOnly", &adapterGffFileName, "Use this option to remove reads which can only map to adapters " "specified in the GFF file."); bool verbose = false; clp.RegisterFlagOption("v", &verbose, "Be verbose."); clp.SetExamples( "Because SAM has optional tags that have different meanings" " in different programs, careful usage is required in order " "to have proper output. The \"xs\" tag in bwa-sw is used to " "show the suboptimal score, but in PacBio SAM (blasr) it is " "defined as the start in the query sequence of the alignment.\n" "When \"-smrtTitle\" is specified, the xs tag is ignored, but " "when it is not specified, the coordinates given by the xs and " "xe tags are used to define the interval of a read that is " "aligned. The CIGAR string is relative to this interval."); clp.ParseCommandLine(argc, argv); // Set random number seed. if (seed == 0) { srand(time(NULL)); } else { srand(seed); } scoreSign = (scoreSignInt == -1)?ScoreSign::NEGATIVE:ScoreSign::POSITIVE; Score s(static_cast<float>(scoreCutoff), scoreSign); FilterCriteria filterCriteria(minAlnLength, minPctSimilarity, minPctAccuracy, true, s); filterCriteria.Verbose(verbose); HitPolicy hitPolicy(hitPolicyStr, scoreSign); string errMsg; if (not filterCriteria.MakeSane(errMsg)) { cout << errMsg << endl; exit(1); } // Parse hole number ranges. 
if (holeNumberStr.size() != 0) { if (not holeNumberRanges.setRanges(holeNumberStr)) { cout << "Could not parse hole number ranges: " << holeNumberStr << "." << endl; exit(1); } } // Open output file. ostream * outFilePtr = &cout; ofstream outFileStrm; if (outFileName != "") { CrucialOpen(outFileName, outFileStrm, std::ios::out); outFilePtr = &outFileStrm; } GFFFile adapterGffFile; if (adapterGffFileName != "") adapterGffFile.ReadAll(adapterGffFileName); SAMReader<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> samReader; FASTAReader fastaReader; // // Initialize samReader and fastaReader. // samReader.Initialize(samFileName); fastaReader.Initialize(refFileName); // // Configure the file log. // string command; CommandLineParser::CommandLineToString(argc, argv, command); string log = "Filter sam hits."; string program = "samFilter"; string versionString = VERSION; AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString); // // Read necessary input. // vector<FASTASequence> references; fastaReader.ReadAllSequences(references); // If the SAM file is generated by blasr with -titleTable, // then references in the SAM are represented by // their corresponding indices in the title table. // In that case, we need to convert reference titles in fasta file // to their corresponding indices in the title table, such that // references in both SAM and fasta files are represented // by title table indices and therefore can match. if (titleTableName != "") { ConvertTitlesToTitleTableIndices(references, titleTableName); } AlignmentSet<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> alignmentSet; vector<string> allHeaders = samReader.ReadHeader(alignmentSet); // Process SAM Header. 
string commandLineString; clp.CommandLineToString(argc, argv, commandLineString); allHeaders.push_back("@PG\tID:SAMFILTER\tVN:" + versionString + \ "\tCL:" + program + " " + commandLineString); for (int i = 0; i < allHeaders.size(); i++) { outFileStrm << allHeaders[i] << endl; } // // The order of references in vector<FASTASequence> references and // AlignmentSet<, , >alignmentSet.references can be different. // Rearrange alignmentSet.references such that they are ordered in // exactly the same way as vector<FASTASequence> references. // alignmentSet.RearrangeReferences(references); // Map reference name obtained from SAM file to indices map<string, int> refNameToIndex; for (int i = 0; i < references.size(); i++) { string refName = alignmentSet.references[i].GetSequenceName(); refNameToIndex[refName] = i; } // // Store the alignments. // SAMAlignment samAlignment; int alignIndex = 0; // // For 150K, each chip produces about 300M sequences // (not including quality values and etc.). // Let's assume that the sam file and reference data can // fit in the memory. // Need to scale for larger sequal data in the future. // vector<SAMAlignment> allSAMAlignments; while (samReader.GetNextAlignment(samAlignment)) { if (samAlignment.rName == "*") { continue; } if (parseSmrtTitle and holeNumberStr.size() != 0) { string movieName; int thisHoleNumber; if (not ParsePBIReadName(samAlignment.qName, movieName, thisHoleNumber)) { cout << "ERROR, could not parse SMRT title: " << samAlignment.qName << "." << endl; exit(1); } if (not holeNumberRanges.contains(UInt(thisHoleNumber))) { if (verbose) cout << thisHoleNumber << " is not in range." << endl; continue; } } if (samAlignment.cigar.find('P') != string::npos) { cout << "WARNING. Could not process SAM record with 'P' in " << "its cigar string." 
<< endl; continue; } vector<AlignmentCandidate<> > convertedAlignments; SAMAlignmentsToCandidates(samAlignment, references, refNameToIndex, convertedAlignments, parseSmrtTitle, false); if (convertedAlignments.size() > 1) { cout << "WARNING. Ignore multiple segments." << endl; continue; } for (int i = 0; i < 1; i++) { AlignmentCandidate<> & alignment = convertedAlignments[i]; //score func does not matter DistanceMatrixScoreFunction<DNASequence, DNASequence> distFunc; ComputeAlignmentStats(alignment, alignment.qAlignedSeq.seq, alignment.tAlignedSeq.seq, distFunc); // Check whether this alignment can only map to adapters in // the adapter GFF file. if (adapterGffFileName != "" and CheckAdapterOnly(adapterGffFile, alignment, refNameToIndex)) { if (verbose) cout << alignment.qName << " filter adapter only." << endl; continue; } // Assign score to samAlignment. samAlignment.score =; if (not filterCriteria.Satisfy(static_cast<AlignmentCandidate<> *>(&alignment))) { continue; } allSAMAlignments.push_back( samAlignment ); alignment.FreeSubsequences(); } ++alignIndex; } // Sort all SAM alignments by qName, score and target position. sort(allSAMAlignments.begin(), allSAMAlignments.end(), byQNameScoreTStart); unsigned int groupBegin = 0; unsigned int groupEnd = -1; vector<SAMAlignment> filteredSAMAlignments; while(groupBegin < allSAMAlignments.size()) { // Get the next group of SAM alignments which have the same qName // from allSAMAlignments[groupBegin ... 
groupEnd) GetNextSAMAlignmentGroup(allSAMAlignments, groupBegin, groupEnd); vector<unsigned int> hitIndices = ApplyHitPolicy( hitPolicy, allSAMAlignments, groupBegin, groupEnd); for(unsigned int i = 0; i < hitIndices.size(); i++) { filteredSAMAlignments.push_back(allSAMAlignments[hitIndices[i]]); } groupBegin = groupEnd; } // Sort all SAM alignments by reference name and query name sort(filteredSAMAlignments.begin(), filteredSAMAlignments.end(), byRNameQName); for(unsigned int i = 0; i < filteredSAMAlignments.size(); i++) { filteredSAMAlignments[i].PrintSAMAlignment(outFileStrm); } if (outFileName != "") { outFileStrm.close(); } #ifdef USE_GOOGLE_PROFILER ProfilerStop(); #endif return 0; }