//
// Read the title at position p and store a copy of it in seq.
//
// The raw title is fetched through the three-argument ReadTitle overload
// into a temporary buffer, copied into seq with CopyTitle, and the
// temporary buffer is then released with delete [].
// NOTE(review): this presumes the overload hands back a new[]-allocated
// buffer (or leaves the pointer NULL) -- confirm against its definition.
//
void FASTAReader::ReadTitle(GenomeLength &p, FASTASequence & seq) {
  char *titleBuffer = NULL;
  int titleLength;

  ReadTitle(p, titleBuffer, titleLength);
  seq.CopyTitle(titleBuffer, titleLength);

  // delete [] on a NULL pointer is a no-op, so no explicit test is needed.
  delete [] titleBuffer;
}
int main(int argc, char* argv[]) { string genomeFileName, subseqFileName; if (argc != 3) { cout << "usage: extractRepeats genome repeat" << endl; exit(0); } genomeFileName = argv[1]; subseqFileName = argv[2]; FASTASequence genome, sub; FASTAReader reader; reader.Init(genomeFileName); reader.GetNext(genome); reader.Init(subseqFileName); reader.GetNext(sub); genome.ToUpper(); sub.ToUpper(); DNALength genomePos; FASTASequence genomeSub; int kband = (int) (0.15) * sub.length; vector<int> scoreMat; vector<Arrow> pathMat; int readIndex = 0; cout << "starting extraction" << endl; for (genomePos = 0; genomePos < genome.length - sub.length + 1; genomePos++) { genomeSub.seq = &genome.seq[genomePos]; genomeSub.length = sub.length; int alignScore; Alignment alignment; alignScore = SWAlign(genomeSub, sub, EditDistanceMatrix, 1, //1,kband, scoreMat, pathMat, alignment, QueryFit); if (alignScore < 0.25 * sub.length) { stringstream titlestrm; titlestrm << readIndex << "|" << genomePos << "|" << genomePos + sub.length << " " << alignScore/ (1.0*sub.length); FASTASequence subcopy; subcopy.CopyTitle(titlestrm.str()); subcopy.seq = &genome.seq[genomePos]; subcopy.length = sub.length; subcopy.PrintSeq(std::cout); genomePos += sub.length; } } }
int main(int argc, char* argv[]) { string barcodeFileName, insertFileName, outputFileName; if (argc != 4) { cout << "usage: makeBarcodeDatabase insert.fasta barcodes.fasta output.fasta" << endl; exit(1); } insertFileName = argv[1]; barcodeFileName = argv[2]; outputFileName = argv[3]; FASTAReader barcodeReader, insertReader; barcodeReader.Initialize(barcodeFileName); insertReader.Initialize(insertFileName); ofstream barcodedOut; CrucialOpen(outputFileName, barcodedOut, std::ios::out); vector<FASTASequence> forwardBarcodes, reverseBarcodes; FASTASequence barcodeSequence, reverseBarcodeSequence; while(barcodeReader.GetNext(barcodeSequence)) { forwardBarcodes.push_back(barcodeSequence); barcodeSequence.MakeRC(reverseBarcodeSequence); reverseBarcodes.push_back(reverseBarcodeSequence); } FASTASequence insert; insertReader.GetNext(insert); int i; for (i = 0; i < forwardBarcodes.size(); i++) { FASTASequence barcodedInsert; barcodedInsert.Resize(forwardBarcodes[i].length * 2 + insert.length); stringstream titleStrm; titleStrm << insert.title << "|ff|" << forwardBarcodes[i].title; barcodedInsert.CopyTitle(titleStrm.str()); memcpy(&barcodedInsert.seq[0], &forwardBarcodes[i].seq[0], forwardBarcodes[i].length); memcpy(&barcodedInsert.seq[forwardBarcodes[i].length], insert.seq, insert.length); memcpy(&barcodedInsert.seq[forwardBarcodes[i].length + insert.length], forwardBarcodes[i].seq, forwardBarcodes[i].length); barcodedInsert.PrintSeq(barcodedOut); titleStrm.str(""); titleStrm << insert.title << "|fr|" << forwardBarcodes[i].title; barcodedInsert.CopyTitle(titleStrm.str()); memcpy(&barcodedInsert.seq[0], &forwardBarcodes[i].seq[0], forwardBarcodes[i].length); memcpy(&barcodedInsert.seq[forwardBarcodes[i].length], insert.seq, insert.length); memcpy(&barcodedInsert.seq[forwardBarcodes[i].length + insert.length], reverseBarcodes[i].seq, reverseBarcodes[i].length); barcodedInsert.PrintSeq(barcodedOut); titleStrm.str(""); titleStrm << insert.title << "|rf|" << 
forwardBarcodes[i].title; barcodedInsert.CopyTitle(titleStrm.str()); memcpy(&barcodedInsert.seq[0], &reverseBarcodes[i].seq[0], reverseBarcodes[i].length); memcpy(&barcodedInsert.seq[reverseBarcodes[i].length], insert.seq, insert.length); memcpy(&barcodedInsert.seq[reverseBarcodes[i].length + insert.length], forwardBarcodes[i].seq, forwardBarcodes[i].length); barcodedInsert.PrintSeq(barcodedOut); titleStrm.str(""); titleStrm << insert.title << "|rr|" << forwardBarcodes[i].title; barcodedInsert.CopyTitle(titleStrm.str()); memcpy(&barcodedInsert.seq[0], &reverseBarcodes[i].seq[0], reverseBarcodes[i].length); memcpy(&barcodedInsert.seq[reverseBarcodes[i].length], insert.seq, insert.length); memcpy(&barcodedInsert.seq[reverseBarcodes[i].length + insert.length], reverseBarcodes[i].seq, reverseBarcodes[i].length); barcodedInsert.PrintSeq(barcodedOut); } }
int main(int argc, char* argv[]) { CommandLineParser clp; string refGenomeName; string mutGenomeName; string gffFileName; float insRate = 0; float delRate = 0; float mutRate = 0; bool lower = false; gffFileName = ""; clp.RegisterStringOption("refGenome", &refGenomeName, "Reference genome.", true); clp.RegisterStringOption("mutGenome", &mutGenomeName, "Mutated genome.", true); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterStringOption("gff", &gffFileName, "GFF file describing the modifications made to the genome."); clp.RegisterFloatOption("i", &insRate, "Insertion rate: (0-1].", CommandLineParser::NonNegativeFloat, false); clp.RegisterFloatOption("d", &delRate, "Deletion rate: (0-1]", CommandLineParser::NonNegativeFloat, false); clp.RegisterFloatOption("m", &mutRate, "Mutation rate, even across all nucleotides: (0-1]", CommandLineParser::NonNegativeFloat, false); clp.RegisterFlagOption("lower", &lower, "Make mutations in lower case", false); vector<string> leftovers; clp.ParseCommandLine(argc, argv, leftovers); FASTAReader reader; FASTASequence refGenome; reader.Init(refGenomeName); ofstream mutGenomeOut; CrucialOpen(mutGenomeName, mutGenomeOut, std::ios::out); ofstream gffOut; if (gffFileName != "") { CrucialOpen(gffFileName, gffOut, std::ios::out); } vector<int> insIndices, delIndices, subIndices; int readIndex = 0; InitializeRandomGeneratorWithTime(); while (reader.GetNext(refGenome)) { insIndices.resize(refGenome.length); delIndices.resize(refGenome.length); subIndices.resize(refGenome.length); std::fill(insIndices.begin(), insIndices.end(), false); std::fill(delIndices.begin(), delIndices.end(), false); std::fill(subIndices.begin(), subIndices.end(), 0); enum ChangeType { Ins, Del, Mut, None}; float changeProb[4]; changeProb[Ins] = insRate; changeProb[Del] = changeProb[Ins] + delRate; changeProb[Mut] = changeProb[Del] + mutRate; changeProb[None] = 1; if (changeProb[Mut] > 1) { cout << "ERROR! 
The sum of the error probabilities must be less than 1" << endl; exit(1); } DNALength pos; float randomNumber; int numIns = 0; int numDel = 0; int numMut = 0; for (pos =0 ; pos < refGenome.length; pos++) { randomNumber = Random(); if (randomNumber < changeProb[Ins]) { insIndices[pos] = true; numIns++; } else if (randomNumber < changeProb[Del]) { delIndices[pos] = true; numDel++; } else if (randomNumber < changeProb[Mut]){ Nucleotide newNuc = TwoBitToAscii[RandomInt(4)]; int maxIts = 100000; int it = 0; while (newNuc == refGenome.seq[pos]) { newNuc = TwoBitToAscii[RandomInt(4)]; if (it == maxIts) { cout << "ERROR, something is wrong with the random number generation, it took too many tries to generate a new nucleotide" << endl; exit(1); } } subIndices[pos] = refGenome[pos]; refGenome.seq[pos] = ToLower(newNuc,lower); ++numMut; } } // cout << readIndex << " m " << numMut << " i " << numIns << " d " << numDel << endl; if (readIndex % 100000 == 0 && readIndex > 0) { cout << readIndex << endl; } // // Now add the insertions and deletions. // FASTASequence newSequence; DNALength newPos; if (numIns - numDel + refGenome.length < 0) { cout << "ERROR, the genome has been deleted to nothing." << endl; exit(1); } ResizeSequence(newSequence, refGenome.length + (numIns - numDel)); newPos = 0; pos = 0; for (pos = 0; pos < refGenome.length; pos++) { assert(newPos < newSequence.length or delIndices[pos] == true); if (subIndices[pos] != 0 and gffFileName != "") { gffOut << refGenome.GetName() << " . SNV " << newPos << " " << newPos <<" 0.00 . . 
reference=" << (char)subIndices[pos] << ";confidence=10;Name=" << newPos << (char)subIndices[pos] << ">" << refGenome.seq[pos] <<";coverage=10;variantseq=" << refGenome.seq[pos] << endl; } if (insIndices[pos] == true) { newSequence.seq[newPos] = ToLower(TwoBitToAscii[RandomInt(4)], lower); newPos++; newSequence.seq[newPos] = refGenome.seq[pos]; assert(newSequence.seq[newPos] != '1'); assert(newSequence.seq[newPos] != 1); if (gffFileName != "") { gffOut << refGenome.GetName() << " . deletion " << newPos << " " << newPos << " 0.00 . . reference=" << newSequence.seq[newPos] << ";length=1;confidence=10;coverage=0;Name="<< newPos << "del" << newSequence.seq[newPos] << endl; } newPos++; } else if (delIndices[pos] == true) { // no-op, skip if (gffFileName != "") { gffOut << refGenome.GetName() << " . insertion " << newPos << " " << newPos << " 0.00 . . confidence=10;Name=" << newPos << "_ins" << refGenome.seq[pos] << ";reference=.;length=1;coverage=0;variantseq=" << refGenome.seq[newPos] << endl; //ref000001 . deletion 20223 20223 0.00 . . reference=T;length=1;confidence=0;coverage=0;Name=20222delT } } else { newSequence.seq[newPos] = refGenome.seq[pos]; newPos++; } } stringstream titlestrm; titlestrm << " mutated ins " << insRate << " del " << delRate << " mut " << mutRate; newSequence.CopyTitle(refGenome.title); newSequence.AppendToTitle(titlestrm.str()); newSequence.PrintSeq(mutGenomeOut); newSequence.Free(); readIndex++; } }
int main(int argc, char* argv[]) { string gencodeGffFileName, genomeFileName, genesOutFileName; string geneType = "protein_coding"; bool randomSplicing = false; int numRandomSplicing = 1; float pSkip = 0.5; if (argc < 4) { cout << "Usage: extractGenes gencodeGTFFile genomeFile genesOutFileName [-geneType type (protein_coding)] [-randomSplicing] [-numRandomSplicing n] [-pSkip prob (0-1, default:0.5)]" << endl; exit(1); } gencodeGffFileName = argv[1]; genomeFileName = argv[2]; genesOutFileName = argv[3]; int argi = 4; string coordinatesFileName; while (argi < argc) { if (strcmp(argv[argi], "-geneType") == 0) { geneType = argv[++argi]; } else if (strcmp(argv[argi], "-randomSplicing") == 0) { randomSplicing = true; } else if (strcmp(argv[argi], "-numRandomSplicing") == 0) { numRandomSplicing = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-pSkip") == 0) { pSkip = atof(argv[++argi]); } else { cout << "ERROR, bad option " << argv[argi] << endl; exit(1); } ++argi; } coordinatesFileName = genesOutFileName; coordinatesFileName.append(".pos"); FASTAReader reader; reader.Initialize(genomeFileName); ofstream outFile, coordsFile; CrucialOpen(genesOutFileName, outFile, std::ios::out); string coordsFileName = genesOutFileName + ".coords"; CrucialOpen(coordsFileName, coordsFile, std::ios::out); vector<FASTASequence> referenceSequences; reader.ReadAllSequences(referenceSequences); int i; map<string, int> titleToIndex; for (i = 0; i < referenceSequences.size(); i++) { titleToIndex[referenceSequences[i].title] = i; } GencodeGFFFile gencodeFile; gencodeFile.ReadAll(gencodeGffFileName); vector<GencodeGFFGene> genes; IndexGencodeGenes(gencodeFile, genes, geneType); for (i = 0; i < genes.size(); i++) { genes[i].OrderExonsByStart(); } int e; for (i = 0; i < genes.size(); i++) { FASTASequence geneSequence; geneSequence.CopyTitle(genes[i].geneName); if (titleToIndex.find(genes[i].chromosome) == titleToIndex.end()) { continue; } int chrIndex = titleToIndex[genes[i].chromosome]; string 
sequence = ""; // // Do nothing with 0 length exons. // if (genes[i].exons.size() == 0) { continue; } vector<FASTASequence> geneSequences; vector<GeneCoordinates> geneCoordinates; genes[i].GenerateGeneSequences(referenceSequences[chrIndex], geneSequences, geneCoordinates, randomSplicing); int gi; for (gi = 0; gi < geneSequences.size(); gi++) { if (genes[i].GetStrand() == '+') { geneSequences[gi].PrintSeq(outFile); } else { FASTASequence rc; geneSequences[gi].MakeRC(rc); rc.PrintSeq(outFile); rc.Free(); } coordsFile << geneSequences[gi].title << " " << geneCoordinates[gi].chromosome << " " << geneCoordinates[gi].exonCoordinates.size() << " " << geneCoordinates[gi].strand; int i; for (i = 0; i < geneCoordinates[gi].exonCoordinates.size(); i++) { coordsFile << " " << geneCoordinates[gi].exonCoordinates[i].start << " " << geneCoordinates[gi].exonCoordinates[i].end << " "; } coordsFile << endl; geneSequences[gi].Free(); } // // No need to free the seq, since it is controlled by the string. // } coordsFile.close(); }
//
// Build the reverse complement of this sequence in rhs.
//
// The sequence work is delegated to DNASequence::MakeRC with the same
// rhsPos / rhsLength arguments; afterwards this sequence's title, when it
// has one, is copied onto rhs.
//
void FASTASequence::MakeRC(FASTASequence &rhs, DNALength rhsPos, DNALength rhsLength) {
  DNASequence &rhsAsDNA = (DNASequence&) rhs;
  DNASequence::MakeRC(rhsAsDNA, rhsPos, rhsLength);

  // Nothing to carry over when this sequence is untitled.
  if (title == NULL) {
    return;
  }
  rhs.CopyTitle(title);
}