int main(int argc, char* argv[]) { string refFileName, queryFileName; int maxHammingDistance; if (argc < 4) { cout << "usage: hammer ref query maxHam " << endl; exit(1); } refFileName = argv[1]; queryFileName = argv[2]; maxHammingDistance = atoi(argv[3]); FASTAReader reader; reader.Initialize(refFileName); FASTASequence ref, refRC; reader.GetNext(ref); ref.MakeRC(refRC); FASTAReader queryReader; queryReader.Initialize(queryFileName); FASTASequence query; queryReader.GetNext(query); DNALength p; for(p=0; p < ref.length-query.length-1; p++ ){ DNASequence subseq; subseq.seq = &ref.seq[p]; subseq.length = query.length; // cout << "t "; subseq.PrintSeq(cout); // cout << "q "; ((DNASequence*)&query)->PrintSeq(cout); if (HammingDistance(&subseq.seq[0], &query.seq[0], query.length) < maxHammingDistance) { cout << ">" << p << endl; subseq.PrintSeq(cout); } int i; for (i =0; i < query.length; i++) { subseq.seq[i] = toupper(subseq.seq[i]); } } for(p=0; p < ref.length-query.length-1; p++ ){ DNASequence subseq; subseq.seq = &refRC.seq[p]; subseq.length = query.length; if (HammingDistance(&subseq.seq[0], &query.seq[0], query.length) < maxHammingDistance) { cout << ">" << p << "rc" << endl; subseq.PrintSeq(cout); } int i; for (i =0; i < query.length; i++) { subseq.seq[i] = toupper(subseq.seq[i]); } } }
int main(int argc, char* argv[]) { string genomeFileName, subseqFileName; if (argc != 3) { cout << "usage: extractRepeats genome repeat" << endl; exit(0); } genomeFileName = argv[1]; subseqFileName = argv[2]; FASTASequence genome, sub; FASTAReader reader; reader.Init(genomeFileName); reader.GetNext(genome); reader.Init(subseqFileName); reader.GetNext(sub); genome.ToUpper(); sub.ToUpper(); DNALength genomePos; FASTASequence genomeSub; int kband = (int) (0.15) * sub.length; vector<int> scoreMat; vector<Arrow> pathMat; int readIndex = 0; cout << "starting extraction" << endl; for (genomePos = 0; genomePos < genome.length - sub.length + 1; genomePos++) { genomeSub.seq = &genome.seq[genomePos]; genomeSub.length = sub.length; int alignScore; Alignment alignment; alignScore = SWAlign(genomeSub, sub, EditDistanceMatrix, 1, //1,kband, scoreMat, pathMat, alignment, QueryFit); if (alignScore < 0.25 * sub.length) { stringstream titlestrm; titlestrm << readIndex << "|" << genomePos << "|" << genomePos + sub.length << " " << alignScore/ (1.0*sub.length); FASTASequence subcopy; subcopy.CopyTitle(titlestrm.str()); subcopy.seq = &genome.seq[genomePos]; subcopy.length = sub.length; subcopy.PrintSeq(std::cout); genomePos += sub.length; } } }
int main(int argc, char* argv[]) { if (argc < 3) { cout << "usage: testBuildOccBins genomeFileName suffixArray" << endl; exit(0); } string genomeFileName = argv[1]; string suffixArrayFileName = argv[2]; FASTAReader reader; reader.Init(genomeFileName); FASTASequence seq; reader.GetNext(seq); DNASuffixArray suffixArray; suffixArray.Read(suffixArrayFileName); Bwt<PackedDNASequence, FASTASequence> bwt; //bwt.InitializeFromSuffixArray(seq, suffixArray.index); bwt.InitializeBWTStringFromSuffixArray(seq, suffixArray.index); bwt.occ.Initialize(bwt.bwtSequence, 4096, 64); bwt.occ.PrintBins(cout); }
int main(int argc, char* argv[]) { string seqFileName; TupleMetrics tm; string outFileName; if (argc < 3) { cout << "usage: storeTuplePosList seqFile tupleSize outFile" << endl; return 0; } seqFileName = argv[1]; tm.tupleSize = atoi(argv[2]); outFileName = argv[3]; ofstream outFile; // CrucialOpen(outFileName, outFile, std::ios::out| std::ios::binary); FASTAReader reader; reader.Init(seqFileName); FASTASequence seq; reader.GetNext(seq); // vector<PositionDNATuple> TupleList<PositionDNATuple>tuplePosList; tuplePosList.SetTupleMetrics(tm); // StoreTuplePosList(seq, tm, tuplePosList); SequenceToTupleList(seq, tm, tuplePosList); tuplePosList.Sort(); tuplePosList.WriteToFile(outFileName); //WriteTuplePosList(tuplePosList, tm.tupleSize, outFile); outFile.close(); return 0; }
/*
 * Reads an input FASTA file, an output file name and a length, plus an
 * optional "-stride s" or "-coverage c".  When only a coverage is given,
 * the stride is derived as genome.length * coverage / length.
 *
 * NOTE(review): the remainder of this function is not present in the
 * visible source, so the block ends where the original text ends.
 */
int main(int argc, char* argv[]) {
  string inFileName, outFileName;
  int length;
  // argv[1..3] are read unconditionally below.
  if (argc < 4) {
    cout << "usage: " << argv[0] << " inFile outFile length [-stride s] [-coverage c]" << endl;
    exit(1);
  }
  inFileName  = argv[1];
  outFileName = argv[2];
  length      = atoi(argv[3]);

  int argi = 4;
  int stride = 0;
  float coverage = 0;
  while (argi < argc) {
    // strcmp returns 0 on a match.  The previous code tested the raw
    // return value, so "-stride 5" was parsed as a coverage and any
    // other word was parsed as a stride.
    const bool isStride   = strcmp(argv[argi], "-stride") == 0;
    const bool isCoverage = strcmp(argv[argi], "-coverage") == 0;
    if (isStride or isCoverage) {
      if (argi + 1 >= argc) {
        cout << "ERROR, " << argv[argi] << " requires a value." << endl;
        exit(1);
      }
      ++argi;
      if (isStride) {
        stride = atoi(argv[argi]);
      }
      else {
        coverage = atof(argv[argi]);
      }
    }
    ++argi;
  }

  FASTAReader reader;
  reader.Initialize(inFileName);
  FASTASequence genome;
  reader.GetNext(genome);

  if (stride == 0 and coverage == 0) {
    cout << "ERROR, must provide stride or coverage. " << endl;
    exit(1);
  }
  if (stride == 0) {
    stride = genome.length * coverage / length;
  }
int main(int argc, char* argv[]) { if (argc < 3) { cout << "usage: bwtLocateList bwtName querySeqFile" << endl; exit(1); } string bwtFileName = argv[1]; string querySeqFileName = argv[2]; bool doPrintResults = false; int maxCount = 0; int argi = 3; bool countOnly = false; while(argi < argc) { if (strcmp(argv[argi], "-print") == 0) { doPrintResults = true; } else if (strcmp(argv[argi], "-max") == 0) { maxCount = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-count") == 0) { countOnly = true; } else { cout << "bad option: " << argv[argi] << endl; } ++argi; } Bwt<PackedDNASequence, FASTASequence> bwt; bwt.Read(bwtFileName); FASTAReader queryReader; queryReader.Init(querySeqFileName); FASTASequence seq; int seqIndex = 0; vector<DNALength> positions; while(queryReader.GetNext(seq)) { positions.clear(); if (countOnly == false) { bwt.Locate(seq, positions, maxCount); } else { DNALength sp,ep; bwt.Count(seq, sp, ep); } // cout << "matched " << positions.size() << " positions." << endl; if (doPrintResults) { int i; for (i = 0; i < positions.size(); i++ ){ cout << positions[i] << " "; } cout << endl; } ++seqIndex; } // float wordCountsPerLookup = (bwt.bwtSequence.nCountInWord *1.0) / bwt.bwtSequence.nCountNuc; // cout << "word counts per lookup: " << wordCountsPerLookup << endl; return 0; }
int main(int argc, char* argv[]) { std::string seqInName, seqOutName, dotOutName; if (argc < 4) { std::cout << "usage: exciseRepeats inName repMaskOutFile outName" << std::endl; std::exit(EXIT_FAILURE); } seqInName = argv[1]; dotOutName = argv[2]; seqOutName = argv[3]; FASTAReader reader; reader.Initialize(seqInName); FASTASequence origSeq; reader.GetNext(origSeq); std::ifstream dotOutFile; CrucialOpen(dotOutName, dotOutFile); std::ofstream seqOutFile; std::ofstream seqOut; CrucialOpen(seqOutName, seqOut, std::ios::out); std::string dotOutLine; getline(dotOutFile, dotOutLine); getline(dotOutFile, dotOutLine); getline(dotOutFile, dotOutLine); while (getline(dotOutFile, dotOutLine)) { std::stringstream lineStrm(dotOutLine); int swScore; float pctDiv, pctDel, pctIns; std::string query; DNALength qPosBegin, qPosEnd; std::string left; char strand; std::string matchingRepeat; std::string repClass; std::string repPos, repEnd, repLeft; int id; lineStrm >> swScore >> pctDiv >> pctDel >> pctIns >> query >> qPosBegin >> qPosEnd >> left >> strand >> matchingRepeat >> repClass >> repPos >> repEnd >> repLeft >> id; for (DNALength seqPos = qPosBegin; seqPos < qPosEnd; seqPos++) { origSeq.seq[seqPos] = 'X'; } } DNALength seqPos, unexPos; unexPos = 0; for (seqPos = 0; seqPos < origSeq.length; seqPos++) { if (origSeq.seq[seqPos] != 'X') { origSeq.seq[unexPos] = origSeq.seq[seqPos]; unexPos++; } } origSeq.length = unexPos; origSeq.PrintSeq(seqOut); return 0; }
int main(int argc, char *argv[]) { string sequencesInName, sequencesOutName; if (argc <3){ cout << "usage: scramble in out" << endl; exit(1); } sequencesInName = argv[1]; sequencesOutName= argv[2]; vector<FASTASequence*> sequences; vector<int> sequenceIndices; FASTAReader reader; reader.Init(sequencesInName); ofstream out; CrucialOpen(sequencesOutName, out, std::ios::out); FASTASequence read; FASTASequence*readPtr; while(reader.GetNext(read)) { readPtr = new FASTASequence; *readPtr = read; sequences.push_back(readPtr); } int i; for (i = 0; i < sequences.size(); i++) { sequenceIndices.push_back(i); } for (i = 0; i < 10*sequences.size(); i++ ){ // // shuffle indices. // int idx1; int idx2; idx1 = RandomInt(sequences.size()); idx2 = RandomInt(sequences.size()); int tmp; tmp = sequenceIndices[idx1]; sequenceIndices[idx1] = sequenceIndices[idx2]; sequenceIndices[idx2] = tmp; } for (i = 0; i < sequenceIndices.size(); i++ ){ sequences[sequenceIndices[i]]->PrintSeq(out); } return 0; }
int main(int argc, char* argv[]) { CommandLineParser clp; string fastaFileName, indexFileName; vector<string> fastaFileNames; vector<string> opts; clp.SetProgramName("bsdb"); clp.SetProgramSummary("Build an index database on a file of sequences.\n" " The index is used to map to reads given alignment positions.\n"); clp.RegisterStringOption("fasta", &fastaFileName, "A file with sequences to build an index."); clp.RegisterStringOption("index", &indexFileName, "The index file."); clp.RegisterPreviousFlagsAsHidden(); clp.ParseCommandLine(argc, argv, opts); ifstream fastaIn; ofstream indexOut; if (FileOfFileNames::IsFOFN(fastaFileName)) { FileOfFileNames::FOFNToList(fastaFileName, fastaFileNames); } else { fastaFileNames.push_back(fastaFileName); } CrucialOpen(indexFileName, indexOut, std::ios::out | std::ios::binary); SequenceIndexDatabase<FASTASequence> seqDB; int fileNameIndex; for (fileNameIndex = 0; fileNameIndex < fastaFileNames.size(); fileNameIndex++){ FASTAReader reader; FASTASequence seq; reader.Init(fastaFileNames[fileNameIndex]); int i = 0; while (reader.GetNext(seq)) { seqDB.AddSequence(seq); i++; } } seqDB.Finalize(); seqDB.WriteDatabase(indexOut); return 0; }
int main(int argc, char* argv[]) { if (argc < 4) { cout << "usage: splitContigs in.fa contiglength out" << endl; exit(1); } string inFileName, outFileName; inFileName = argv[1]; int contigLength = atoi(argv[2]); outFileName = argv[3]; ofstream seqOut; CrucialOpen(outFileName, seqOut, std::ios::out); FASTAReader reader; reader.Init(inFileName); FASTASequence seq; DNALength curOffset; while(reader.GetNext(seq)) { FASTASequence subseq; int i; curOffset = 0; for (i =0 ; i < seq.length / contigLength + 1; i++ ) { subseq.seq = &seq.seq[curOffset]; subseq.title = seq.title; if (curOffset + contigLength > seq.length) { subseq.length = seq.length - curOffset; } else { subseq.length = contigLength; } subseq.PrintSeq(seqOut); curOffset += contigLength; } } return 0; }
int main(int argc, char* argv[1]) { if (argc < 3) { cout << "Usage: findUnique genome.fasta query.fasta effective_k [options]" << endl; cout << " genome.fasta.sa must exist." << endl; cout << " Finds sequences at least effective_k in length that are unique." << endl; cout << " -max m Allow up to m matches" << endl; cout << " -minLength l Ensure the length of the match is at least this." << endl; cout << " -prefix p n Allow up to n matches across a prefix of length p" << endl; cout << " -suffix s n Allow up to n matches across a suffix of length s" << endl; cout << " Prefix and suffix options override max." << endl; cout << " -out file Print queries to this output file (query.fasta.queries)" << endl; exit(0); } DNASuffixArray sarray; string genomeFileName = argv[1]; string suffixArrayFileName = genomeFileName + ".sa"; FASTAReader reader; FASTASequence genome; int maxN = 0; int prefix = 0; int suffix = 0; int prefixN = 0; int suffixN = 0; int argi = 4; string outputFileName = ""; int minLength = 0; while (argi < argc) { if (strcmp(argv[argi], "-max") == 0) { ++argi; maxN = atoi(argv[argi]); } else if (strcmp(argv[argi], "-prefix") == 0) { ++argi; prefix = atoi(argv[argi]); ++argi; prefixN = atoi(argv[argi]); } else if (strcmp(argv[argi], "-suffix") == 0) { ++argi; suffix = atoi(argv[argi]); ++argi; suffixN = atoi(argv[argi]); } else if (strcmp(argv[argi], "-out") == 0) { ++argi; outputFileName = argv[argi]; } else if (strcmp(argv[argi], "-minLength") == 0) { ++argi; minLength = atoi(argv[argi]); } ++argi; } reader.Initialize(genomeFileName); reader.ReadAllSequencesIntoOne(genome); sarray.Read(suffixArrayFileName); FASTAReader queryReader; FASTASequence querySequence; string queryFileName = argv[2]; int maxLength = atoi(argv[3]); string summaryTableFileName = queryFileName + ".summary"; if (outputFileName == "") { outputFileName = queryFileName + ".queries"; } ofstream summaryTable(summaryTableFileName.c_str()); ofstream outputFile(outputFileName.c_str()); 
queryReader.Initialize(queryFileName); while (queryReader.GetNext(querySequence)) { int i; cerr << "searching " << querySequence.title << endl; if (querySequence.length < maxLength) { continue; } int nMatches = 0; querySequence.ToUpper(); int localMax; for (i = 0; i < querySequence.length - maxLength + 1; i++) { if ((i + 1) % 100000 == 0) { cerr << "processed: " << i + 1 << endl; } int lcpLength; vector<SAIndex> lcpLeftBounds, lcpRightBounds; vector<SAIndex> rclcpLeftBounds, rclcpRightBounds; localMax = maxN; if (i < prefix) { localMax = prefixN; } if (i >= querySequence.length - suffix) { localMax = suffixN; } if (querySequence.length - i <= maxLength) { continue; } if (querySequence.seq[i] == 'N') { continue; } lcpLength = sarray.StoreLCPBounds(genome.seq, genome.length, // The string which the suffix array is built on. &querySequence.seq[i], querySequence.length-i, true, maxLength, lcpLeftBounds, lcpRightBounds, false); if (lcpLength < minLength) { continue; } if (lcpLength < maxLength or lcpRightBounds.size() == 0 or (lcpRightBounds.size() > 0 and lcpLeftBounds.size() > 0 and lcpRightBounds[lcpRightBounds.size() - 1] - lcpLeftBounds[lcpLeftBounds.size()-1] <= localMax)) { FASTASequence rc; DNASequence subseq; subseq.ReferenceSubstring(querySequence, i, maxLength); subseq.MakeRC(rc); int rclcpLength; int numForwardMatches; if (lcpLength == 0) { numForwardMatches = 0; } else { numForwardMatches = lcpRightBounds[lcpRightBounds.size() - 1] - lcpLeftBounds[lcpLeftBounds.size()-1]; } rclcpLength = sarray.StoreLCPBounds(genome.seq, genome.length, // The string which the suffix array is built on. 
rc.seq, maxLength, true, rclcpLength, rclcpLeftBounds, rclcpRightBounds, false); string rcstr((const char*)rc.seq, rc.length); if (rclcpLength < maxLength or rclcpRightBounds.size() == 0 or (numForwardMatches + rclcpRightBounds[rclcpRightBounds.size() - 1] - rclcpLeftBounds[rclcpLeftBounds.size()-1] <= localMax)) { char* substr = new char[maxLength+1]; substr[maxLength] = '\0'; memcpy(substr, &querySequence.seq[i], maxLength); // string substr = string((const char*) querySequence.seq, i, maxLength); outputFile << querySequence.title << "\t" << substr << "\t" << i << endl; ++nMatches; delete[] substr; // } } rc.Free(); } } summaryTable << querySequence.title << "\t" << nMatches << endl; querySequence.Free(); } outputFile.close(); genome.Free(); }
int main(int argc, char* argv[]) { string ad1File, ad2File, readsFile, readsOutFile; FASTAReader ad1Reader; FASTAReader ad2Reader; FASTAReader reader; CommandLineParser cl; float minPctSimilarity = 0.60; int indel = 3; int minLength = 10; cl.RegisterStringOption("ad1", &ad1File, "FASTA file with the first adapter"); cl.RegisterStringOption("ad2", &ad2File, "FASTA file with the second adapter"); cl.RegisterStringOption("reads", &readsFile, "FASTA file with SMRTBell reads"); cl.RegisterStringOption("readsout", &readsOutFile, "output file for split reads"); cl.RegisterPreviousFlagsAsHidden(); cl.RegisterFloatOption("pctSim", &minPctSimilarity, "Minimum percent similarity to trigger a match to an adapter.", CommandLineParser::PositiveFloat); cl.RegisterIntOption("indel", &indel, "Penalty for indel (positive)", CommandLineParser::NonNegativeInteger); cl.RegisterIntOption("minLength", &minLength, "Minimum length pass to retain.", CommandLineParser::PositiveInteger); vector<string> opts; cl.ParseCommandLine(argc, argv, opts); /* * Open all the required files, quitting if they are unavailable. */ ad1Reader.Init(ad1File); ad2Reader.Init(ad2File); reader.Init(readsFile); ofstream splitOut; CrucialOpen(readsOutFile, splitOut); FASTASequence ad1, ad2; ad1Reader.GetNext(ad1); ad2Reader.GetNext(ad2); FASTASequence read; vector<int> scoreMat; vector<Arrow> pathMat; int readIndex = 0; while(reader.GetNext(read)) { read.ToUpper(); // // Do a fitting sequence alignment to match one of the two // adapters into the read. // vector<int> passStarts, passLengths, la; read.PrintSeq(cout); SplitRead(read, 0, read.length, ad1, ad2, indel, passStarts, passLengths,la, 0, scoreMat, pathMat, minPctSimilarity, minLength); int i; for (i = 0; i < passStarts.size(); i++) { cout << "read: " << readIndex << " pass: "******" " << passStarts[i] << " " << passLengths[i] << " " << la[i] << endl; } ++readIndex; } }
int main(int argc, char* argv[]) { CommandLineParser clp; string refGenomeName; string mutGenomeName; string gffFileName; float insRate = 0; float delRate = 0; float mutRate = 0; bool lower = false; gffFileName = ""; clp.RegisterStringOption("refGenome", &refGenomeName, "Reference genome.", true); clp.RegisterStringOption("mutGenome", &mutGenomeName, "Mutated genome.", true); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterStringOption("gff", &gffFileName, "GFF file describing the modifications made to the genome."); clp.RegisterFloatOption("i", &insRate, "Insertion rate: (0-1].", CommandLineParser::NonNegativeFloat, false); clp.RegisterFloatOption("d", &delRate, "Deletion rate: (0-1]", CommandLineParser::NonNegativeFloat, false); clp.RegisterFloatOption("m", &mutRate, "Mutation rate, even across all nucleotides: (0-1]", CommandLineParser::NonNegativeFloat, false); clp.RegisterFlagOption("lower", &lower, "Make mutations in lower case", false); vector<string> leftovers; clp.ParseCommandLine(argc, argv, leftovers); FASTAReader reader; FASTASequence refGenome; reader.Init(refGenomeName); ofstream mutGenomeOut; CrucialOpen(mutGenomeName, mutGenomeOut, std::ios::out); ofstream gffOut; if (gffFileName != "") { CrucialOpen(gffFileName, gffOut, std::ios::out); } vector<int> insIndices, delIndices, subIndices; int readIndex = 0; InitializeRandomGeneratorWithTime(); while (reader.GetNext(refGenome)) { insIndices.resize(refGenome.length); delIndices.resize(refGenome.length); subIndices.resize(refGenome.length); std::fill(insIndices.begin(), insIndices.end(), false); std::fill(delIndices.begin(), delIndices.end(), false); std::fill(subIndices.begin(), subIndices.end(), 0); enum ChangeType { Ins, Del, Mut, None}; float changeProb[4]; changeProb[Ins] = insRate; changeProb[Del] = changeProb[Ins] + delRate; changeProb[Mut] = changeProb[Del] + mutRate; changeProb[None] = 1; if (changeProb[Mut] > 1) { cout << "ERROR! 
The sum of the error probabilities must be less than 1" << endl; exit(1); } DNALength pos; float randomNumber; int numIns = 0; int numDel = 0; int numMut = 0; for (pos =0 ; pos < refGenome.length; pos++) { randomNumber = Random(); if (randomNumber < changeProb[Ins]) { insIndices[pos] = true; numIns++; } else if (randomNumber < changeProb[Del]) { delIndices[pos] = true; numDel++; } else if (randomNumber < changeProb[Mut]){ Nucleotide newNuc = TwoBitToAscii[RandomInt(4)]; int maxIts = 100000; int it = 0; while (newNuc == refGenome.seq[pos]) { newNuc = TwoBitToAscii[RandomInt(4)]; if (it == maxIts) { cout << "ERROR, something is wrong with the random number generation, it took too many tries to generate a new nucleotide" << endl; exit(1); } } subIndices[pos] = refGenome[pos]; refGenome.seq[pos] = ToLower(newNuc,lower); ++numMut; } } // cout << readIndex << " m " << numMut << " i " << numIns << " d " << numDel << endl; if (readIndex % 100000 == 0 && readIndex > 0) { cout << readIndex << endl; } // // Now add the insertions and deletions. // FASTASequence newSequence; DNALength newPos; if (numIns - numDel + refGenome.length < 0) { cout << "ERROR, the genome has been deleted to nothing." << endl; exit(1); } ResizeSequence(newSequence, refGenome.length + (numIns - numDel)); newPos = 0; pos = 0; for (pos = 0; pos < refGenome.length; pos++) { assert(newPos < newSequence.length or delIndices[pos] == true); if (subIndices[pos] != 0 and gffFileName != "") { gffOut << refGenome.GetName() << " . SNV " << newPos << " " << newPos <<" 0.00 . . 
reference=" << (char)subIndices[pos] << ";confidence=10;Name=" << newPos << (char)subIndices[pos] << ">" << refGenome.seq[pos] <<";coverage=10;variantseq=" << refGenome.seq[pos] << endl; } if (insIndices[pos] == true) { newSequence.seq[newPos] = ToLower(TwoBitToAscii[RandomInt(4)], lower); newPos++; newSequence.seq[newPos] = refGenome.seq[pos]; assert(newSequence.seq[newPos] != '1'); assert(newSequence.seq[newPos] != 1); if (gffFileName != "") { gffOut << refGenome.GetName() << " . deletion " << newPos << " " << newPos << " 0.00 . . reference=" << newSequence.seq[newPos] << ";length=1;confidence=10;coverage=0;Name="<< newPos << "del" << newSequence.seq[newPos] << endl; } newPos++; } else if (delIndices[pos] == true) { // no-op, skip if (gffFileName != "") { gffOut << refGenome.GetName() << " . insertion " << newPos << " " << newPos << " 0.00 . . confidence=10;Name=" << newPos << "_ins" << refGenome.seq[pos] << ";reference=.;length=1;coverage=0;variantseq=" << refGenome.seq[newPos] << endl; //ref000001 . deletion 20223 20223 0.00 . . reference=T;length=1;confidence=0;coverage=0;Name=20222delT } } else { newSequence.seq[newPos] = refGenome.seq[pos]; newPos++; } } stringstream titlestrm; titlestrm << " mutated ins " << insRate << " del " << delRate << " mut " << mutRate; newSequence.CopyTitle(refGenome.title); newSequence.AppendToTitle(titlestrm.str()); newSequence.PrintSeq(mutGenomeOut); newSequence.Free(); readIndex++; } }
int main(int argc, char* argv[]) { FASTAReader reader; if (argc < 5) { cout << "usage: wordCounter seqFile tupleSize tupleOutputFile posOutputFile" << endl; exit(1); } string fileName = argv[1]; int tupleSize = atoi(argv[2]); string tupleListName = argv[3]; string posOutName = argv[4]; TupleMetrics tm; tm.Initialize(tupleSize); reader.Init(fileName); FASTASequence seq; reader.GetNext(seq); vector<CountedDNATuple> tupleList; CountedDNATuple tuple; DNALength i; for (i = 0; i < seq.length - tm.tupleSize + 1; i++ ) { if (tuple.FromStringRL((Nucleotide*) (seq.seq + i), tm)) { tuple.count = i; tupleList.push_back(tuple); } } std::sort(tupleList.begin(), tupleList.end()); int t; int t2; int numTuples = tupleList.size(); t = t2 = 0; int numUnique = 0; while (t < numTuples) { t2 = t; t2++; while (t2 < numTuples and tupleList[t] == tupleList[t2]) { t2++; } ++numUnique; t = t2; } ofstream countedTupleListOut; countedTupleListOut.open(tupleListName.c_str(), ios_base::binary); ofstream posOut; posOut.open(posOutName.c_str(), ios_base::binary); countedTupleListOut.write((const char*) &numUnique, sizeof(int)); countedTupleListOut.write((const char*) &tm.tupleSize, sizeof(int)); posOut.write((const char*) &numUnique, sizeof(int)); // // Write out the tuple+counts to a file. // t = t2 = 0; CountedDNATuple countedTuple; int numMultOne = 0; while (t < numTuples) { t2 = t; t2++; while (t2 < numTuples and tupleList[t] == tupleList[t2]) { t2++; } countedTuple.tuple = tupleList[t].tuple; countedTuple.count = t2 - t; if (countedTuple.count == 1) ++numMultOne; countedTupleListOut.write((const char*) &countedTuple,sizeof(CountedDNATuple)); posOut.write((char*)&countedTuple.count, sizeof(int)); int tc; for (tc = t; tc < t2; tc++) { posOut.write((char*) &tupleList[tc].count, sizeof(int)); } t = t2; } // // Write out the positions of the tuples to a file. // posOut.close(); countedTupleListOut.close(); // cout << "found " << numUnique << " distinct " << DNATuple::TupleSize << "-mers." 
<< endl; cout << numMultOne << endl; return 0; }
int main(int argc, char* argv[]) { string cmpFileName; string refFileName; string readsFileName; string mapqvTrackName; if (argc < 2) { cout << " printMapqvTrack: print a gff file of the average mapping quality value" << endl; exit(1); } vector<int> refPositions; cmpFileName = argv[1]; refFileName = argv[2]; mapqvFileName = argv[3]; CmpFile cmpFile; FASTASequence ref; FASTAReader reader; reader.Initialize(refFileName); reader.GetNext(ref); HDFBasReader basReader; SMRTSequence seq, *seqPtr; vector<int> refCoverage; refCoverage.resize(ref.length); std::fill(refCoverage.begin(), refCoverage.end(), 0); /* * These guys pull information from the same pls file. */ HDFCmpReader<CmpAlignment> cmpReader; if (cmpReader.Initialize(cmpFileName) == 0) { cout << "ERROR, could not open the cmp file." << endl; exit(1); } cmpReader.Read(cmpFile); UInt alignmentIndex; // movieIndexSets.resize(nMovies); for (alignmentIndex = 0; alignmentIndex < cmpFile.alnInfo.alignments.size(); alignmentIndex++) { int refSeqId = cmpFile.alnInfo.alignments[alignmentIndex].GetRefSeqId(); int readGroupId = cmpFile.alnInfo.alignments[alignmentIndex].GetReadGroupId(); int refSeqIdIndex; if (cmpFile.refSeqTable.GetIndexOfId(refSeqId, refSeqIdIndex) == false) { // // Sanity check -- we're only looking at alignments to references in the cmp file. // cout << "ERROR, ref seq id: " << refSeqId << " should exist in the cmp file but it does not." << endl; assert(0); } int readGroupIdIndex; cmpFile.readGroupTable.GetIndexOfId(readGroupId, readGroupIdIndex); string readGroupPath = cmpFile.readGroupTable.names[readGroupIdIndex]; string readGroup = cmpReader.readGroupPathToReadGroup[readGroupPath]; int readGroupArrayIndex = cmpReader.refAlignGroups[refSeqIdIndex]->experimentNameToIndex[readGroup]; vector<char> alignedSequence, alignedTarget; // // This read overlaps one of the ref positions. 
UInt offsetEnd, offsetBegin; offsetEnd = cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetEnd(); offsetBegin = cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetBegin(); vector<unsigned char> byteAlignment; int alignedSequenceLength = offsetEnd - offsetBegin; if (alignedSequenceLength >= 0) { alignedSequence.resize(alignedSequenceLength); alignedTarget.resize(alignedSequenceLength); byteAlignment.resize(alignedSequenceLength); } cmpReader.refAlignGroups[refSeqIdIndex]->readGroups[readGroupArrayIndex]->alignmentArray.Read(offsetBegin, offsetEnd, &byteAlignment[0]); UInt refStart = cmpFile.alnInfo.alignments[alignmentIndex].GetRefStart(); UInt refEnd = cmpFile.alnInfo.alignments[alignmentIndex].GetRefEnd(); UInt readStart= cmpFile.alnInfo.alignments[alignmentIndex].GetQueryStart(); UInt readEnd = cmpFile.alnInfo.alignments[alignmentIndex].GetQueryEnd(); // // Read the alignment string. // if (refSeqIdIndex > 0) continue; // // Convert to something we can compare easily. // alignedSequence[alignedSequence.size()-1]= '\0'; ByteAlignmentToQueryString(&byteAlignment[0], byteAlignment.size(), &alignedSequence[0]); ByteAlignmentToRefString(&byteAlignment[0], byteAlignment.size(), &alignedTarget[0]); int gi, i; gi = 0; int refStrand = cmpFile.alnInfo.alignments[alignmentIndex].GetRCRefStrand(); if (refStrand == 1) { // revcomp the ref strand vector<char> rcAlignedTarget, rcAlignedQuery; int t; rcAlignedTarget.resize(alignedTarget.size()); rcAlignedQuery.resize(alignedSequence.size()); for (t = 0; t < alignedTarget.size(); t++) { if (alignedTarget[t] == ' ') { rcAlignedTarget[alignedTarget.size() - t - 1] = ' '; } else { rcAlignedTarget[alignedTarget.size() - t - 1] = ReverseComplementNuc[alignedTarget[t]]; } if (alignedSequence[t] == ' '){ rcAlignedQuery[alignedTarget.size() - t - 1] = ' '; } else { rcAlignedQuery[alignedTarget.size() - t - 1] = ReverseComplementNuc[alignedTarget[t]]; } } alignedTarget = rcAlignedTarget; alignedSequence = rcAlignedQuery; } int 
holeNumber = cmpFile.alnInfo.alignments[alignmentIndex].GetHoleNumber(); int ri = readStart; gi = refStart; for (i = 0; i < alignedTarget.size(); i++, gi++, ri++ ) { while(i < alignedTarget.size() and alignedTarget[i] == ' ') { i++; } if (alignedSequence[i] != ' ') { refCoverage[gi]++; } } } // end looping over regions // Now compute the number of gaps. UInt pos; int numNotCovered = 0; for (pos = 0; pos < refCoverage.size(); pos++ ){ if (refCoverage[pos] < 1) { numNotCovered++;} } if (numNotCovered > 100) { cout << "TOO Many!!!" << endl; } else { for (pos = 0; pos < refCoverage.size(); pos++ ){ // cout << refCoverage[pos] << endl; if (refCoverage[pos] < 1) { int left, right; left = right = -1; if (pos > 0) { left = refCoverage[pos-1];} if (pos < refCoverage.size()-1) {right = refCoverage[pos+1];} cout << pos << " " << left << " " << right << endl; } } } }