int main(int argc, char* argv[]) { std::string seqInName, seqOutName, dotOutName; if (argc < 4) { std::cout << "usage: exciseRepeats inName repMaskOutFile outName" << std::endl; std::exit(EXIT_FAILURE); } seqInName = argv[1]; dotOutName = argv[2]; seqOutName = argv[3]; FASTAReader reader; reader.Initialize(seqInName); FASTASequence origSeq; reader.GetNext(origSeq); std::ifstream dotOutFile; CrucialOpen(dotOutName, dotOutFile); std::ofstream seqOutFile; std::ofstream seqOut; CrucialOpen(seqOutName, seqOut, std::ios::out); std::string dotOutLine; getline(dotOutFile, dotOutLine); getline(dotOutFile, dotOutLine); getline(dotOutFile, dotOutLine); while (getline(dotOutFile, dotOutLine)) { std::stringstream lineStrm(dotOutLine); int swScore; float pctDiv, pctDel, pctIns; std::string query; DNALength qPosBegin, qPosEnd; std::string left; char strand; std::string matchingRepeat; std::string repClass; std::string repPos, repEnd, repLeft; int id; lineStrm >> swScore >> pctDiv >> pctDel >> pctIns >> query >> qPosBegin >> qPosEnd >> left >> strand >> matchingRepeat >> repClass >> repPos >> repEnd >> repLeft >> id; for (DNALength seqPos = qPosBegin; seqPos < qPosEnd; seqPos++) { origSeq.seq[seqPos] = 'X'; } } DNALength seqPos, unexPos; unexPos = 0; for (seqPos = 0; seqPos < origSeq.length; seqPos++) { if (origSeq.seq[seqPos] != 'X') { origSeq.seq[unexPos] = origSeq.seq[seqPos]; unexPos++; } } origSeq.length = unexPos; origSeq.PrintSeq(seqOut); return 0; }
int main(int argc, char* argv[]) { string genomeFileName, subseqFileName; if (argc != 3) { cout << "usage: extractRepeats genome repeat" << endl; exit(0); } genomeFileName = argv[1]; subseqFileName = argv[2]; FASTASequence genome, sub; FASTAReader reader; reader.Init(genomeFileName); reader.GetNext(genome); reader.Init(subseqFileName); reader.GetNext(sub); genome.ToUpper(); sub.ToUpper(); DNALength genomePos; FASTASequence genomeSub; int kband = (int) (0.15) * sub.length; vector<int> scoreMat; vector<Arrow> pathMat; int readIndex = 0; cout << "starting extraction" << endl; for (genomePos = 0; genomePos < genome.length - sub.length + 1; genomePos++) { genomeSub.seq = &genome.seq[genomePos]; genomeSub.length = sub.length; int alignScore; Alignment alignment; alignScore = SWAlign(genomeSub, sub, EditDistanceMatrix, 1, //1,kband, scoreMat, pathMat, alignment, QueryFit); if (alignScore < 0.25 * sub.length) { stringstream titlestrm; titlestrm << readIndex << "|" << genomePos << "|" << genomePos + sub.length << " " << alignScore/ (1.0*sub.length); FASTASequence subcopy; subcopy.CopyTitle(titlestrm.str()); subcopy.seq = &genome.seq[genomePos]; subcopy.length = sub.length; subcopy.PrintSeq(std::cout); genomePos += sub.length; } } }
int main(int argc, char* argv[]) { if (argc < 4) { cout << "usage: splitContigs in.fa contiglength out" << endl; exit(1); } string inFileName, outFileName; inFileName = argv[1]; int contigLength = atoi(argv[2]); outFileName = argv[3]; ofstream seqOut; CrucialOpen(outFileName, seqOut, std::ios::out); FASTAReader reader; reader.Init(inFileName); FASTASequence seq; DNALength curOffset; while(reader.GetNext(seq)) { FASTASequence subseq; int i; curOffset = 0; for (i =0 ; i < seq.length / contigLength + 1; i++ ) { subseq.seq = &seq.seq[curOffset]; subseq.title = seq.title; if (curOffset + contigLength > seq.length) { subseq.length = seq.length - curOffset; } else { subseq.length = contigLength; } subseq.PrintSeq(seqOut); curOffset += contigLength; } } return 0; }
int main(int argc, char* argv[]) { string barcodeFileName, insertFileName, outputFileName; if (argc != 4) { cout << "usage: makeBarcodeDatabase insert.fasta barcodes.fasta output.fasta" << endl; exit(1); } insertFileName = argv[1]; barcodeFileName = argv[2]; outputFileName = argv[3]; FASTAReader barcodeReader, insertReader; barcodeReader.Initialize(barcodeFileName); insertReader.Initialize(insertFileName); ofstream barcodedOut; CrucialOpen(outputFileName, barcodedOut, std::ios::out); vector<FASTASequence> forwardBarcodes, reverseBarcodes; FASTASequence barcodeSequence, reverseBarcodeSequence; while(barcodeReader.GetNext(barcodeSequence)) { forwardBarcodes.push_back(barcodeSequence); barcodeSequence.MakeRC(reverseBarcodeSequence); reverseBarcodes.push_back(reverseBarcodeSequence); } FASTASequence insert; insertReader.GetNext(insert); int i; for (i = 0; i < forwardBarcodes.size(); i++) { FASTASequence barcodedInsert; barcodedInsert.Resize(forwardBarcodes[i].length * 2 + insert.length); stringstream titleStrm; titleStrm << insert.title << "|ff|" << forwardBarcodes[i].title; barcodedInsert.CopyTitle(titleStrm.str()); memcpy(&barcodedInsert.seq[0], &forwardBarcodes[i].seq[0], forwardBarcodes[i].length); memcpy(&barcodedInsert.seq[forwardBarcodes[i].length], insert.seq, insert.length); memcpy(&barcodedInsert.seq[forwardBarcodes[i].length + insert.length], forwardBarcodes[i].seq, forwardBarcodes[i].length); barcodedInsert.PrintSeq(barcodedOut); titleStrm.str(""); titleStrm << insert.title << "|fr|" << forwardBarcodes[i].title; barcodedInsert.CopyTitle(titleStrm.str()); memcpy(&barcodedInsert.seq[0], &forwardBarcodes[i].seq[0], forwardBarcodes[i].length); memcpy(&barcodedInsert.seq[forwardBarcodes[i].length], insert.seq, insert.length); memcpy(&barcodedInsert.seq[forwardBarcodes[i].length + insert.length], reverseBarcodes[i].seq, reverseBarcodes[i].length); barcodedInsert.PrintSeq(barcodedOut); titleStrm.str(""); titleStrm << insert.title << "|rf|" << forwardBarcodes[i].title; barcodedInsert.CopyTitle(titleStrm.str()); memcpy(&barcodedInsert.seq[0], &reverseBarcodes[i].seq[0], reverseBarcodes[i].length); memcpy(&barcodedInsert.seq[reverseBarcodes[i].length], insert.seq, insert.length); memcpy(&barcodedInsert.seq[reverseBarcodes[i].length + insert.length], forwardBarcodes[i].seq, forwardBarcodes[i].length); barcodedInsert.PrintSeq(barcodedOut); titleStrm.str(""); titleStrm << insert.title << "|rr|" << forwardBarcodes[i].title; barcodedInsert.CopyTitle(titleStrm.str()); memcpy(&barcodedInsert.seq[0], &reverseBarcodes[i].seq[0], reverseBarcodes[i].length); memcpy(&barcodedInsert.seq[reverseBarcodes[i].length], insert.seq, insert.length); memcpy(&barcodedInsert.seq[reverseBarcodes[i].length + insert.length], reverseBarcodes[i].seq, reverseBarcodes[i].length); barcodedInsert.PrintSeq(barcodedOut); } }
int main(int argc, char* argv[]) { string ad1File, ad2File, readsFile, readsOutFile; FASTAReader ad1Reader; FASTAReader ad2Reader; FASTAReader reader; CommandLineParser cl; float minPctSimilarity = 0.60; int indel = 3; int minLength = 10; cl.RegisterStringOption("ad1", &ad1File, "FASTA file with the first adapter"); cl.RegisterStringOption("ad2", &ad2File, "FASTA file with the second adapter"); cl.RegisterStringOption("reads", &readsFile, "FASTA file with SMRTBell reads"); cl.RegisterStringOption("readsout", &readsOutFile, "output file for split reads"); cl.RegisterPreviousFlagsAsHidden(); cl.RegisterFloatOption("pctSim", &minPctSimilarity, "Minimum percent similarity to trigger a match to an adapter.", CommandLineParser::PositiveFloat); cl.RegisterIntOption("indel", &indel, "Penalty for indel (positive)", CommandLineParser::NonNegativeInteger); cl.RegisterIntOption("minLength", &minLength, "Minimum length pass to retain.", CommandLineParser::PositiveInteger); vector<string> opts; cl.ParseCommandLine(argc, argv, opts); /* * Open all the required files, quitting if they are unavailable. */ ad1Reader.Init(ad1File); ad2Reader.Init(ad2File); reader.Init(readsFile); ofstream splitOut; CrucialOpen(readsOutFile, splitOut); FASTASequence ad1, ad2; ad1Reader.GetNext(ad1); ad2Reader.GetNext(ad2); FASTASequence read; vector<int> scoreMat; vector<Arrow> pathMat; int readIndex = 0; while(reader.GetNext(read)) { read.ToUpper(); // // Do a fitting sequence alignment to match one of the two // adapters into the read. // vector<int> passStarts, passLengths, la; read.PrintSeq(cout); SplitRead(read, 0, read.length, ad1, ad2, indel, passStarts, passLengths,la, 0, scoreMat, pathMat, minPctSimilarity, minLength); int i; for (i = 0; i < passStarts.size(); i++) { cout << "read: " << readIndex << " pass: "******" " << passStarts[i] << " " << passLengths[i] << " " << la[i] << endl; } ++readIndex; } }
int main(int argc, char* argv[]) { CommandLineParser clp; string refGenomeName; string mutGenomeName; string gffFileName; float insRate = 0; float delRate = 0; float mutRate = 0; bool lower = false; gffFileName = ""; clp.RegisterStringOption("refGenome", &refGenomeName, "Reference genome.", true); clp.RegisterStringOption("mutGenome", &mutGenomeName, "Mutated genome.", true); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterStringOption("gff", &gffFileName, "GFF file describing the modifications made to the genome."); clp.RegisterFloatOption("i", &insRate, "Insertion rate: (0-1].", CommandLineParser::NonNegativeFloat, false); clp.RegisterFloatOption("d", &delRate, "Deletion rate: (0-1]", CommandLineParser::NonNegativeFloat, false); clp.RegisterFloatOption("m", &mutRate, "Mutation rate, even across all nucleotides: (0-1]", CommandLineParser::NonNegativeFloat, false); clp.RegisterFlagOption("lower", &lower, "Make mutations in lower case", false); vector<string> leftovers; clp.ParseCommandLine(argc, argv, leftovers); FASTAReader reader; FASTASequence refGenome; reader.Init(refGenomeName); ofstream mutGenomeOut; CrucialOpen(mutGenomeName, mutGenomeOut, std::ios::out); ofstream gffOut; if (gffFileName != "") { CrucialOpen(gffFileName, gffOut, std::ios::out); } vector<int> insIndices, delIndices, subIndices; int readIndex = 0; InitializeRandomGeneratorWithTime(); while (reader.GetNext(refGenome)) { insIndices.resize(refGenome.length); delIndices.resize(refGenome.length); subIndices.resize(refGenome.length); std::fill(insIndices.begin(), insIndices.end(), false); std::fill(delIndices.begin(), delIndices.end(), false); std::fill(subIndices.begin(), subIndices.end(), 0); enum ChangeType { Ins, Del, Mut, None}; float changeProb[4]; changeProb[Ins] = insRate; changeProb[Del] = changeProb[Ins] + delRate; changeProb[Mut] = changeProb[Del] + mutRate; changeProb[None] = 1; if (changeProb[Mut] > 1) { cout << "ERROR! The sum of the error probabilities must be less than 1" << endl; exit(1); } DNALength pos; float randomNumber; int numIns = 0; int numDel = 0; int numMut = 0; for (pos =0 ; pos < refGenome.length; pos++) { randomNumber = Random(); if (randomNumber < changeProb[Ins]) { insIndices[pos] = true; numIns++; } else if (randomNumber < changeProb[Del]) { delIndices[pos] = true; numDel++; } else if (randomNumber < changeProb[Mut]){ Nucleotide newNuc = TwoBitToAscii[RandomInt(4)]; int maxIts = 100000; int it = 0; while (newNuc == refGenome.seq[pos]) { newNuc = TwoBitToAscii[RandomInt(4)]; if (it == maxIts) { cout << "ERROR, something is wrong with the random number generation, it took too many tries to generate a new nucleotide" << endl; exit(1); } } subIndices[pos] = refGenome[pos]; refGenome.seq[pos] = ToLower(newNuc,lower); ++numMut; } } // cout << readIndex << " m " << numMut << " i " << numIns << " d " << numDel << endl; if (readIndex % 100000 == 0 && readIndex > 0) { cout << readIndex << endl; } // // Now add the insertions and deletions. // FASTASequence newSequence; DNALength newPos; if (numIns - numDel + refGenome.length < 0) { cout << "ERROR, the genome has been deleted to nothing." << endl; exit(1); } ResizeSequence(newSequence, refGenome.length + (numIns - numDel)); newPos = 0; pos = 0; for (pos = 0; pos < refGenome.length; pos++) { assert(newPos < newSequence.length or delIndices[pos] == true); if (subIndices[pos] != 0 and gffFileName != "") { gffOut << refGenome.GetName() << " . SNV " << newPos << " " << newPos <<" 0.00 . . reference=" << (char)subIndices[pos] << ";confidence=10;Name=" << newPos << (char)subIndices[pos] << ">" << refGenome.seq[pos] <<";coverage=10;variantseq=" << refGenome.seq[pos] << endl; } if (insIndices[pos] == true) { newSequence.seq[newPos] = ToLower(TwoBitToAscii[RandomInt(4)], lower); newPos++; newSequence.seq[newPos] = refGenome.seq[pos]; assert(newSequence.seq[newPos] != '1'); assert(newSequence.seq[newPos] != 1); if (gffFileName != "") { gffOut << refGenome.GetName() << " . deletion " << newPos << " " << newPos << " 0.00 . . reference=" << newSequence.seq[newPos] << ";length=1;confidence=10;coverage=0;Name="<< newPos << "del" << newSequence.seq[newPos] << endl; } newPos++; } else if (delIndices[pos] == true) { // no-op, skip if (gffFileName != "") { gffOut << refGenome.GetName() << " . insertion " << newPos << " " << newPos << " 0.00 . . confidence=10;Name=" << newPos << "_ins" << refGenome.seq[pos] << ";reference=.;length=1;coverage=0;variantseq=" << refGenome.seq[newPos] << endl; //ref000001 . deletion 20223 20223 0.00 . . reference=T;length=1;confidence=0;coverage=0;Name=20222delT } } else { newSequence.seq[newPos] = refGenome.seq[pos]; newPos++; } } stringstream titlestrm; titlestrm << " mutated ins " << insRate << " del " << delRate << " mut " << mutRate; newSequence.CopyTitle(refGenome.title); newSequence.AppendToTitle(titlestrm.str()); newSequence.PrintSeq(mutGenomeOut); newSequence.Free(); readIndex++; } }
int main(int argc, char* argv[]) { string gencodeGffFileName, genomeFileName, genesOutFileName; string geneType = "protein_coding"; bool randomSplicing = false; int numRandomSplicing = 1; float pSkip = 0.5; if (argc < 4) { cout << "Usage: extractGenes gencodeGTFFile genomeFile genesOutFileName [-geneType type (protein_coding)] [-randomSplicing] [-numRandomSplicing n] [-pSkip prob (0-1, default:0.5)]" << endl; exit(1); } gencodeGffFileName = argv[1]; genomeFileName = argv[2]; genesOutFileName = argv[3]; int argi = 4; string coordinatesFileName; while (argi < argc) { if (strcmp(argv[argi], "-geneType") == 0) { geneType = argv[++argi]; } else if (strcmp(argv[argi], "-randomSplicing") == 0) { randomSplicing = true; } else if (strcmp(argv[argi], "-numRandomSplicing") == 0) { numRandomSplicing = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-pSkip") == 0) { pSkip = atof(argv[++argi]); } else { cout << "ERROR, bad option " << argv[argi] << endl; exit(1); } ++argi; } coordinatesFileName = genesOutFileName; coordinatesFileName.append(".pos"); FASTAReader reader; reader.Initialize(genomeFileName); ofstream outFile, coordsFile; CrucialOpen(genesOutFileName, outFile, std::ios::out); string coordsFileName = genesOutFileName + ".coords"; CrucialOpen(coordsFileName, coordsFile, std::ios::out); vector<FASTASequence> referenceSequences; reader.ReadAllSequences(referenceSequences); int i; map<string, int> titleToIndex; for (i = 0; i < referenceSequences.size(); i++) { titleToIndex[referenceSequences[i].title] = i; } GencodeGFFFile gencodeFile; gencodeFile.ReadAll(gencodeGffFileName); vector<GencodeGFFGene> genes; IndexGencodeGenes(gencodeFile, genes, geneType); for (i = 0; i < genes.size(); i++) { genes[i].OrderExonsByStart(); } int e; for (i = 0; i < genes.size(); i++) { FASTASequence geneSequence; geneSequence.CopyTitle(genes[i].geneName); if (titleToIndex.find(genes[i].chromosome) == titleToIndex.end()) { continue; } int chrIndex = titleToIndex[genes[i].chromosome]; string sequence = ""; // // Do nothing with 0 length exons. // if (genes[i].exons.size() == 0) { continue; } vector<FASTASequence> geneSequences; vector<GeneCoordinates> geneCoordinates; genes[i].GenerateGeneSequences(referenceSequences[chrIndex], geneSequences, geneCoordinates, randomSplicing); int gi; for (gi = 0; gi < geneSequences.size(); gi++) { if (genes[i].GetStrand() == '+') { geneSequences[gi].PrintSeq(outFile); } else { FASTASequence rc; geneSequences[gi].MakeRC(rc); rc.PrintSeq(outFile); rc.Free(); } coordsFile << geneSequences[gi].title << " " << geneCoordinates[gi].chromosome << " " << geneCoordinates[gi].exonCoordinates.size() << " " << geneCoordinates[gi].strand; int i; for (i = 0; i < geneCoordinates[gi].exonCoordinates.size(); i++) { coordsFile << " " << geneCoordinates[gi].exonCoordinates[i].start << " " << geneCoordinates[gi].exonCoordinates[i].end << " "; } coordsFile << endl; geneSequences[gi].Free(); } // // No need to free the seq, since it is controlled by the string. // } coordsFile.close(); }