int main(int argc, char* argv[]) { std::string seqInName, seqOutName, dotOutName; if (argc < 4) { std::cout << "usage: exciseRepeats inName repMaskOutFile outName" << std::endl; std::exit(EXIT_FAILURE); } seqInName = argv[1]; dotOutName = argv[2]; seqOutName = argv[3]; FASTAReader reader; reader.Initialize(seqInName); FASTASequence origSeq; reader.GetNext(origSeq); std::ifstream dotOutFile; CrucialOpen(dotOutName, dotOutFile); std::ofstream seqOutFile; std::ofstream seqOut; CrucialOpen(seqOutName, seqOut, std::ios::out); std::string dotOutLine; getline(dotOutFile, dotOutLine); getline(dotOutFile, dotOutLine); getline(dotOutFile, dotOutLine); while (getline(dotOutFile, dotOutLine)) { std::stringstream lineStrm(dotOutLine); int swScore; float pctDiv, pctDel, pctIns; std::string query; DNALength qPosBegin, qPosEnd; std::string left; char strand; std::string matchingRepeat; std::string repClass; std::string repPos, repEnd, repLeft; int id; lineStrm >> swScore >> pctDiv >> pctDel >> pctIns >> query >> qPosBegin >> qPosEnd >> left >> strand >> matchingRepeat >> repClass >> repPos >> repEnd >> repLeft >> id; for (DNALength seqPos = qPosBegin; seqPos < qPosEnd; seqPos++) { origSeq.seq[seqPos] = 'X'; } } DNALength seqPos, unexPos; unexPos = 0; for (seqPos = 0; seqPos < origSeq.length; seqPos++) { if (origSeq.seq[seqPos] != 'X') { origSeq.seq[unexPos] = origSeq.seq[seqPos]; unexPos++; } } origSeq.length = unexPos; origSeq.PrintSeq(seqOut); return 0; }
// Reads the next record with GetNext and, when one was read, appends a single
// "N" followed by that record (after CleanupASCII) to `cur`.
//
// Returns the value GetNext returned; `cur` is left untouched when that value
// is zero. The temporary record is freed in both cases.
int FASTAReader::ConcatenateNext(FASTASequence &cur) {
  FASTASequence nextRecord;
  const int status = GetNext(nextRecord);

  if (status == 0) {
    nextRecord.Free();
    return status;
  }

  nextRecord.CleanupASCII();
  // One 'N' separates the records inside the concatenated sequence.
  cur.Concatenate((Nucleotide*) "N");
  cur.Concatenate(nextRecord);

  nextRecord.Free();
  return status;
}
int main(int argc, char* argv[]) { string genomeFileName, subseqFileName; if (argc != 3) { cout << "usage: extractRepeats genome repeat" << endl; exit(0); } genomeFileName = argv[1]; subseqFileName = argv[2]; FASTASequence genome, sub; FASTAReader reader; reader.Init(genomeFileName); reader.GetNext(genome); reader.Init(subseqFileName); reader.GetNext(sub); genome.ToUpper(); sub.ToUpper(); DNALength genomePos; FASTASequence genomeSub; int kband = (int) (0.15) * sub.length; vector<int> scoreMat; vector<Arrow> pathMat; int readIndex = 0; cout << "starting extraction" << endl; for (genomePos = 0; genomePos < genome.length - sub.length + 1; genomePos++) { genomeSub.seq = &genome.seq[genomePos]; genomeSub.length = sub.length; int alignScore; Alignment alignment; alignScore = SWAlign(genomeSub, sub, EditDistanceMatrix, 1, //1,kband, scoreMat, pathMat, alignment, QueryFit); if (alignScore < 0.25 * sub.length) { stringstream titlestrm; titlestrm << readIndex << "|" << genomePos << "|" << genomePos + sub.length << " " << alignScore/ (1.0*sub.length); FASTASequence subcopy; subcopy.CopyTitle(titlestrm.str()); subcopy.seq = &genome.seq[genomePos]; subcopy.length = sub.length; subcopy.PrintSeq(std::cout); genomePos += sub.length; } } }
// Reads the title at position `p` through the (position, buffer, length)
// overload of ReadTitle and copies it into `seq`.
//
// `p` is passed on to that overload by reference. The temporary buffer it
// hands back is released here after the copy.
void FASTAReader::ReadTitle(GenomeLength &p, FASTASequence & seq) {
  int titleLength;
  char *titleBuffer = NULL;

  ReadTitle(p, titleBuffer, titleLength);
  seq.CopyTitle(titleBuffer, titleLength);

  // delete[] on a null pointer is a no-op, so no explicit test is required.
  delete [] titleBuffer;
}
// Create a reverse complement FASTASequence of *this and assign to rhs. void FASTASequence::MakeRC(FASTASequence &rhs, DNALength rhsPos, DNALength rhsLength) { rhs.Free(); DNASequence::MakeRC((DNASequence&) rhs, rhsPos, rhsLength); if (title != NULL) { (static_cast<FASTASequence*>(&rhs))->CopyTitle(title); } }
int main(int argc, char* argv[]) { if (argc < 4) { cout << "usage: splitContigs in.fa contiglength out" << endl; exit(1); } string inFileName, outFileName; inFileName = argv[1]; int contigLength = atoi(argv[2]); outFileName = argv[3]; ofstream seqOut; CrucialOpen(outFileName, seqOut, std::ios::out); FASTAReader reader; reader.Init(inFileName); FASTASequence seq; DNALength curOffset; while(reader.GetNext(seq)) { FASTASequence subseq; int i; curOffset = 0; for (i =0 ; i < seq.length / contigLength + 1; i++ ) { subseq.seq = &seq.seq[curOffset]; subseq.title = seq.title; if (curOffset + contigLength > seq.length) { subseq.length = seq.length - curOffset; } else { subseq.length = contigLength; } subseq.PrintSeq(seqOut); curOffset += contigLength; } } return 0; }
// Reads the record that starts at curPos into `seq` and advances curPos to
// the end of that record.
//
// Returns 0 without touching `seq` when curPos is already at fileSize,
// otherwise 1. `seq` is freed before it is refilled. Whitespace (space, tab,
// '\n', '\r') inside the sequence body is skipped and every other byte is
// mapped through convMat. When computeMD5 is set, curReadMD5 receives the
// MD5 of the stored bases.
//
// Exits the program when the title start is invalid (CheckValidTitleStart)
// or when the record holds more than UINT_MAX bases.
int FASTAReader::GetNext(FASTASequence &seq) {
  if (curPos == fileSize) {
    return 0;
  }

  seq.Free(); //Free seq before read

  //
  // Extract the title of the current record.
  //
  GenomeLength p = curPos;
  AdvanceToTitleStart(p);

  //
  // Make sure there is a '>'
  //
  CheckValidTitleStart(p);
  ReadTitle(p, seq);

  //
  // Read in the next sequence.
  //

  // First pass: count the non-whitespace bytes up to the next record
  // delimiter so the buffer can be allocated in one piece.
  GenomeLength seqLength = 0;
  curPos = p;
  char c;
  while (p < fileSize and (c = filePtr[p]) != endOfReadDelim) {
    if (c != ' ' and c != '\t' and c != '\n' and c != '\r') {
      seqLength++;
    }
    p++;
  }
  if (seqLength > UINT_MAX) {
    cout << "ERROR! Reading sequences stored in more than 4Gbytes of space is not supported." << endl;
    exit(1);
  }

  seq.length = 0;
  if (seqLength > 0) {
    seq.length = seqLength;
    seq.seq = ProtectedNew <Nucleotide>(seqLength+padding+1);
    p = curPos;
    seq.deleteOnExit = true;

    // Second pass: copy the bases, converting each through convMat.
    GenomeLength s = 0;
    while (p < fileSize and (c = filePtr[p]) != endOfReadDelim) {
      if (c != ' ' and c != '\t' and c != '\n' and c != '\r') {
        seq.seq[s] = convMat[static_cast<unsigned char>(filePtr[p])];
        s++;
      }
      p++;
    }
    seq.seq[seqLength] = 0;
  }
  curPos = p;

  if (computeMD5) {
    // Hash the bases themselves. The previous code passed &seq.seq, i.e. the
    // address of the pointer member, so the digest covered the pointer's own
    // bytes (and whatever followed them) instead of the sequence.
    const char *md5Input = (seq.length > 0) ? (const char*) seq.seq : "";
    MakeMD5(md5Input, seq.length, curReadMD5);
  }
  return 1;
}
// Reads every record from curPos to the end of the file into the single
// sequence `seq`, writing one 'N' in place of each later title line and one
// final 'N' after the last base.
//
// seq       Freed, then resized to hold the remainder of the file plus
//           padding. Its title is the title of the first record.
// seqDBPtr  Optional. When non-NULL it receives the record names, the start
//           offset pushed for each later record, the final length, and (when
//           computeMD5 is set) one MD5 string per completed record. Finalize()
//           is called on it before returning.
//
// Returns seq.length, which includes the trailing 'N'.
// Exits the program when the first record has no title, or when the data or
// the resulting sequence exceeds UINT_MAX.
GenomeLength FASTAReader::ReadAllSequencesIntoOne(FASTASequence &seq, SequenceIndexDatabase<FASTASequence> *seqDBPtr) {
  seq.Free();
  GenomeLength p = curPos;
  AdvanceToTitleStart(p);
  CheckValidTitleStart(p);
  ReadTitle(p, seq);
  if (seq.title == NULL) {
    cout << "ERROR, sequence must have a nonempty title." << endl;
    exit(1);
  }
  if (seqDBPtr != NULL) {
    // Only the name of the first record is stored here; no start position is
    // pushed for it in this function.
    seqDBPtr->growableName.push_back(seq.title);
  }

  // Everything after the first title is copied raw; the buffer is therefore
  // an upper bound on the final sequence length.
  GenomeLength seqLength;
  seqLength = fileSize - p;
  GenomeLength memorySize = seqLength+padding+1;
  if (memorySize > UINT_MAX) {
    cout << "ERROR! Reading fasta files greater than 4Gbytes is not supported." << endl;
    exit(1);
  }
  seq.Resize(memorySize);

  // Raw copy of the file remainder into the sequence buffer.
  GenomeLength i;
  i = 0L;
  for (; p < fileSize; p++, i++ ) {
    seq.seq[i] = filePtr[p];
  }

  // In-place compaction: p is the read cursor and i the write cursor over the
  // same buffer. i never gets ahead of p, so unread bytes are not overwritten.
  i = p = 0;
  while (p < seqLength) {
    //
    // If this is the beginning of another read, add an 'N'
    // to delineate spaces between reads.
    //

    // Skip whitespace at the read cursor.
    while (p < seqLength and (seq.seq[p] == ' ' or seq.seq[p] == '\n' or seq.seq[p] == '\t' or seq.seq[p] == '\r')) {
      p++;
    }
    if (p < seqLength and seq.seq[p] == '>') {
      // Title line of a later record: emit the separator, then advance the
      // read cursor to the end of the line.
      seq.seq[i] = 'N';
      GenomeLength titleStartPos = p+1;
      i++;
      while (p < seqLength and seq.seq[p] != '\n') p++;
      // A title that runs to the end of the data without a newline is not
      // recorded in the database.
      if (seqDBPtr != NULL and p < seqLength) {
        string title;
        GenomeLength tp;
        for (tp = titleStartPos; tp < p; tp++) {
          title.push_back(seq.seq[tp]);
        }
        seqDBPtr->growableName.push_back(title);
        seqDBPtr->growableSeqStartPos.push_back(i);
        int nSeq = seqDBPtr->growableSeqStartPos.size();
        if (nSeq > 1 and computeMD5) {
          // MD5 of the span between the two most recent start positions,
          // minus one byte for the 'N' separator.
          string md5Str;
          MakeMD5((const char*) &seq.seq[seqDBPtr->growableSeqStartPos[nSeq-2]], seqDBPtr->growableSeqStartPos[nSeq-1] - seqDBPtr->growableSeqStartPos[nSeq-2] - 1, md5Str);
          seqDBPtr->md5.push_back(md5Str);
        }
      }
    }
    else if (p < seqLength) {
      // The read cursor is on a byte that is neither whitespace nor '>':
      // convert it through convMat and store it at the write cursor.
      seq.seq[i] = convMat[seq.seq[p]];
      i++;
      p++;
    }
  }
  if (i > UINT_MAX) {
    cout << "ERROR! Sequences greater than 4Gbase are not supported." << endl;
    exit(1);
  }

  //
  // Append an 'N' at the end of the last sequence for consistency
  // between different orderings of reference input.
  //
  seq.seq[i] = 'N';
  i++;
  seq.length = i;

  // fill padding.
  for (; i < memorySize; i++ ){
    seq.seq[i] = 0;
  }
  seq.deleteOnExit = true;

  if (seqDBPtr != NULL) {
    // Close the last record: its end is the final sequence length.
    seqDBPtr->growableSeqStartPos.push_back(seq.length);
    int nSeq = seqDBPtr->growableSeqStartPos.size();
    if (nSeq > 1 and computeMD5) {
      string md5Str;
      MakeMD5((const char*) &seq.seq[seqDBPtr->growableSeqStartPos[nSeq-2]], seqDBPtr->growableSeqStartPos[nSeq-1] - seqDBPtr->growableSeqStartPos[nSeq-2] - 1, md5Str);
      seqDBPtr->md5.push_back(md5Str);
    }
    seqDBPtr->Finalize();
  }
  return seq.length;
}
int main(int argc, char* argv[]) { string ad1File, ad2File, readsFile, readsOutFile; FASTAReader ad1Reader; FASTAReader ad2Reader; FASTAReader reader; CommandLineParser cl; float minPctSimilarity = 0.60; int indel = 3; int minLength = 10; cl.RegisterStringOption("ad1", &ad1File, "FASTA file with the first adapter"); cl.RegisterStringOption("ad2", &ad2File, "FASTA file with the second adapter"); cl.RegisterStringOption("reads", &readsFile, "FASTA file with SMRTBell reads"); cl.RegisterStringOption("readsout", &readsOutFile, "output file for split reads"); cl.RegisterPreviousFlagsAsHidden(); cl.RegisterFloatOption("pctSim", &minPctSimilarity, "Minimum percent similarity to trigger a match to an adapter.", CommandLineParser::PositiveFloat); cl.RegisterIntOption("indel", &indel, "Penalty for indel (positive)", CommandLineParser::NonNegativeInteger); cl.RegisterIntOption("minLength", &minLength, "Minimum length pass to retain.", CommandLineParser::PositiveInteger); vector<string> opts; cl.ParseCommandLine(argc, argv, opts); /* * Open all the required files, quitting if they are unavailable. */ ad1Reader.Init(ad1File); ad2Reader.Init(ad2File); reader.Init(readsFile); ofstream splitOut; CrucialOpen(readsOutFile, splitOut); FASTASequence ad1, ad2; ad1Reader.GetNext(ad1); ad2Reader.GetNext(ad2); FASTASequence read; vector<int> scoreMat; vector<Arrow> pathMat; int readIndex = 0; while(reader.GetNext(read)) { read.ToUpper(); // // Do a fitting sequence alignment to match one of the two // adapters into the read. // vector<int> passStarts, passLengths, la; read.PrintSeq(cout); SplitRead(read, 0, read.length, ad1, ad2, indel, passStarts, passLengths,la, 0, scoreMat, pathMat, minPctSimilarity, minLength); int i; for (i = 0; i < passStarts.size(); i++) { cout << "read: " << readIndex << " pass: "******" " << passStarts[i] << " " << passLengths[i] << " " << la[i] << endl; } ++readIndex; } }
int main(int argc, char* argv[]) { CommandLineParser clp; string refGenomeName; string mutGenomeName; string gffFileName; float insRate = 0; float delRate = 0; float mutRate = 0; bool lower = false; gffFileName = ""; clp.RegisterStringOption("refGenome", &refGenomeName, "Reference genome.", true); clp.RegisterStringOption("mutGenome", &mutGenomeName, "Mutated genome.", true); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterStringOption("gff", &gffFileName, "GFF file describing the modifications made to the genome."); clp.RegisterFloatOption("i", &insRate, "Insertion rate: (0-1].", CommandLineParser::NonNegativeFloat, false); clp.RegisterFloatOption("d", &delRate, "Deletion rate: (0-1]", CommandLineParser::NonNegativeFloat, false); clp.RegisterFloatOption("m", &mutRate, "Mutation rate, even across all nucleotides: (0-1]", CommandLineParser::NonNegativeFloat, false); clp.RegisterFlagOption("lower", &lower, "Make mutations in lower case", false); vector<string> leftovers; clp.ParseCommandLine(argc, argv, leftovers); FASTAReader reader; FASTASequence refGenome; reader.Init(refGenomeName); ofstream mutGenomeOut; CrucialOpen(mutGenomeName, mutGenomeOut, std::ios::out); ofstream gffOut; if (gffFileName != "") { CrucialOpen(gffFileName, gffOut, std::ios::out); } vector<int> insIndices, delIndices, subIndices; int readIndex = 0; InitializeRandomGeneratorWithTime(); while (reader.GetNext(refGenome)) { insIndices.resize(refGenome.length); delIndices.resize(refGenome.length); subIndices.resize(refGenome.length); std::fill(insIndices.begin(), insIndices.end(), false); std::fill(delIndices.begin(), delIndices.end(), false); std::fill(subIndices.begin(), subIndices.end(), 0); enum ChangeType { Ins, Del, Mut, None}; float changeProb[4]; changeProb[Ins] = insRate; changeProb[Del] = changeProb[Ins] + delRate; changeProb[Mut] = changeProb[Del] + mutRate; changeProb[None] = 1; if (changeProb[Mut] > 1) { cout << "ERROR! 
The sum of the error probabilities must be less than 1" << endl; exit(1); } DNALength pos; float randomNumber; int numIns = 0; int numDel = 0; int numMut = 0; for (pos =0 ; pos < refGenome.length; pos++) { randomNumber = Random(); if (randomNumber < changeProb[Ins]) { insIndices[pos] = true; numIns++; } else if (randomNumber < changeProb[Del]) { delIndices[pos] = true; numDel++; } else if (randomNumber < changeProb[Mut]){ Nucleotide newNuc = TwoBitToAscii[RandomInt(4)]; int maxIts = 100000; int it = 0; while (newNuc == refGenome.seq[pos]) { newNuc = TwoBitToAscii[RandomInt(4)]; if (it == maxIts) { cout << "ERROR, something is wrong with the random number generation, it took too many tries to generate a new nucleotide" << endl; exit(1); } } subIndices[pos] = refGenome[pos]; refGenome.seq[pos] = ToLower(newNuc,lower); ++numMut; } } // cout << readIndex << " m " << numMut << " i " << numIns << " d " << numDel << endl; if (readIndex % 100000 == 0 && readIndex > 0) { cout << readIndex << endl; } // // Now add the insertions and deletions. // FASTASequence newSequence; DNALength newPos; if (numIns - numDel + refGenome.length < 0) { cout << "ERROR, the genome has been deleted to nothing." << endl; exit(1); } ResizeSequence(newSequence, refGenome.length + (numIns - numDel)); newPos = 0; pos = 0; for (pos = 0; pos < refGenome.length; pos++) { assert(newPos < newSequence.length or delIndices[pos] == true); if (subIndices[pos] != 0 and gffFileName != "") { gffOut << refGenome.GetName() << " . SNV " << newPos << " " << newPos <<" 0.00 . . 
reference=" << (char)subIndices[pos] << ";confidence=10;Name=" << newPos << (char)subIndices[pos] << ">" << refGenome.seq[pos] <<";coverage=10;variantseq=" << refGenome.seq[pos] << endl; } if (insIndices[pos] == true) { newSequence.seq[newPos] = ToLower(TwoBitToAscii[RandomInt(4)], lower); newPos++; newSequence.seq[newPos] = refGenome.seq[pos]; assert(newSequence.seq[newPos] != '1'); assert(newSequence.seq[newPos] != 1); if (gffFileName != "") { gffOut << refGenome.GetName() << " . deletion " << newPos << " " << newPos << " 0.00 . . reference=" << newSequence.seq[newPos] << ";length=1;confidence=10;coverage=0;Name="<< newPos << "del" << newSequence.seq[newPos] << endl; } newPos++; } else if (delIndices[pos] == true) { // no-op, skip if (gffFileName != "") { gffOut << refGenome.GetName() << " . insertion " << newPos << " " << newPos << " 0.00 . . confidence=10;Name=" << newPos << "_ins" << refGenome.seq[pos] << ";reference=.;length=1;coverage=0;variantseq=" << refGenome.seq[newPos] << endl; //ref000001 . deletion 20223 20223 0.00 . . reference=T;length=1;confidence=0;coverage=0;Name=20222delT } } else { newSequence.seq[newPos] = refGenome.seq[pos]; newPos++; } } stringstream titlestrm; titlestrm << " mutated ins " << insRate << " del " << delRate << " mut " << mutRate; newSequence.CopyTitle(refGenome.title); newSequence.AppendToTitle(titlestrm.str()); newSequence.PrintSeq(mutGenomeOut); newSequence.Free(); readIndex++; } }
int main(int argc, char* argv[]) { string genomeFileName; string suffixArrayFileName; if (argc < 4) { cout << "Usage: printWordCount genome suffixArray k [k2 k3 k4...]" << endl; exit(1); } genomeFileName = argv[1]; suffixArrayFileName = argv[2]; int argi = 3; vector<DNALength> k; while (argi < argc) { k.push_back(atoi(argv[argi])); argi++; } // Get the ref sequence. FASTAReader reader; reader.Init(genomeFileName); FASTASequence seq; // reader.GetNext(seq); reader.ReadAllSequencesIntoOne(seq); seq.ToUpper(); // Get the suffix array. DNASuffixArray sarray; sarray.Read(suffixArrayFileName); int ki; char *word; cout << "wordlen word nword" << endl; for (ki = 0; ki < k.size(); ki++) { word = new char[k[ki]+1]; word[k[ki]] = '\0'; DNALength i; DNALength numUnique = 0; for (i = 0; i < seq.length - k[ki] - 1; ) { DNALength j = i + 1; bool seqAtN = false; int si; for(si = 0; si < k[ki]; si++) { if (seq.seq[sarray.index[i] + si] == 'N') { seqAtN = true; break; } } if (seqAtN) { i++; continue; } while (j < seq.length - k[ki] and seq.length - sarray.index[i] >= k[ki] and seq.length - sarray.index[j] >= k[ki] and strncmp((const char*) &seq.seq[sarray.index[i]], (const char*) &seq.seq[sarray.index[j]], k[ki]) == 0) { j++; } if (seq.length - sarray.index[i] >= k[ki]) { for(si = 0; si < k[ki]; si++) { word[si] = seq.seq[sarray.index[i]+si]; } cout << k[ki] << " " << word << " " << j - i + 1 << endl; if (j == i + 1) { ++numUnique; } } i = j; } } }
int main(int argc, char* argv[]) { string gencodeGffFileName, genomeFileName, genesOutFileName; string geneType = "protein_coding"; bool randomSplicing = false; int numRandomSplicing = 1; float pSkip = 0.5; if (argc < 4) { cout << "Usage: extractGenes gencodeGTFFile genomeFile genesOutFileName [-geneType type (protein_coding)] [-randomSplicing] [-numRandomSplicing n] [-pSkip prob (0-1, default:0.5)]" << endl; exit(1); } gencodeGffFileName = argv[1]; genomeFileName = argv[2]; genesOutFileName = argv[3]; int argi = 4; string coordinatesFileName; while (argi < argc) { if (strcmp(argv[argi], "-geneType") == 0) { geneType = argv[++argi]; } else if (strcmp(argv[argi], "-randomSplicing") == 0) { randomSplicing = true; } else if (strcmp(argv[argi], "-numRandomSplicing") == 0) { numRandomSplicing = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-pSkip") == 0) { pSkip = atof(argv[++argi]); } else { cout << "ERROR, bad option " << argv[argi] << endl; exit(1); } ++argi; } coordinatesFileName = genesOutFileName; coordinatesFileName.append(".pos"); FASTAReader reader; reader.Initialize(genomeFileName); ofstream outFile, coordsFile; CrucialOpen(genesOutFileName, outFile, std::ios::out); string coordsFileName = genesOutFileName + ".coords"; CrucialOpen(coordsFileName, coordsFile, std::ios::out); vector<FASTASequence> referenceSequences; reader.ReadAllSequences(referenceSequences); int i; map<string, int> titleToIndex; for (i = 0; i < referenceSequences.size(); i++) { titleToIndex[referenceSequences[i].title] = i; } GencodeGFFFile gencodeFile; gencodeFile.ReadAll(gencodeGffFileName); vector<GencodeGFFGene> genes; IndexGencodeGenes(gencodeFile, genes, geneType); for (i = 0; i < genes.size(); i++) { genes[i].OrderExonsByStart(); } int e; for (i = 0; i < genes.size(); i++) { FASTASequence geneSequence; geneSequence.CopyTitle(genes[i].geneName); if (titleToIndex.find(genes[i].chromosome) == titleToIndex.end()) { continue; } int chrIndex = titleToIndex[genes[i].chromosome]; string 
sequence = ""; // // Do nothing with 0 length exons. // if (genes[i].exons.size() == 0) { continue; } vector<FASTASequence> geneSequences; vector<GeneCoordinates> geneCoordinates; genes[i].GenerateGeneSequences(referenceSequences[chrIndex], geneSequences, geneCoordinates, randomSplicing); int gi; for (gi = 0; gi < geneSequences.size(); gi++) { if (genes[i].GetStrand() == '+') { geneSequences[gi].PrintSeq(outFile); } else { FASTASequence rc; geneSequences[gi].MakeRC(rc); rc.PrintSeq(outFile); rc.Free(); } coordsFile << geneSequences[gi].title << " " << geneCoordinates[gi].chromosome << " " << geneCoordinates[gi].exonCoordinates.size() << " " << geneCoordinates[gi].strand; int i; for (i = 0; i < geneCoordinates[gi].exonCoordinates.size(); i++) { coordsFile << " " << geneCoordinates[gi].exonCoordinates[i].start << " " << geneCoordinates[gi].exonCoordinates[i].end << " "; } coordsFile << endl; geneSequences[gi].Free(); } // // No need to free the seq, since it is controlled by the string. // } coordsFile.close(); }
int main(int argc, char* argv[]) { string refGenomeFileName = ""; string lengthModelFileName = ""; string outputModelFileName = ""; DNALength numBasesPerFile = 0; string sourceReadsFileName = ""; string titleTableFileName = ""; int numBasH5Files = 1; string basH5BaseFileName = "simulated"; string movieName = "m101211_092754_00114_cSIM_s1_p0"; bool doRandGenInit = true; bool usePosMap = false; bool printPercentRepeat = false; string posMapFileName = ""; vector<string> movieNames; bool useLengthModel = false; bool useFixedLength = false; ofstream posMapFile; int scaledLength = 0; int fixedLength = 0; int nBasFiles = 1; bool useLengthsModel = true; bool printHelp = false; // Look to see if the refAsReads flag is specified anywhere before // parsing the command line. CommandLineParser clp; string commandLine; string helpString; SetHelp(helpString); vector<string> fns; clp.RegisterStringOption("genome", &refGenomeFileName, ""); clp.RegisterIntOption("numBasesPerFile", (int*)&numBasesPerFile, "", CommandLineParser::PositiveInteger); clp.RegisterStringOption("sourceReads", &sourceReadsFileName, ""); clp.RegisterStringOption("lengthModel", &lengthModelFileName, ""); clp.RegisterIntOption("fixedLength", &fixedLength, "", CommandLineParser::PositiveInteger); clp.RegisterFlagOption("lengthModel", &useLengthModel, ""); clp.RegisterStringOption("movieName", &movieName, ""); clp.RegisterStringOption("titleTable", &titleTableFileName, ""); clp.RegisterStringOption("baseFileName", &basH5BaseFileName, ""); clp.RegisterIntOption("nFiles", &nBasFiles, "", CommandLineParser::PositiveInteger); clp.RegisterIntOption("meanLength", &scaledLength, "", CommandLineParser::PositiveInteger); clp.RegisterStringOption("posMap", &posMapFileName, ""); clp.RegisterFlagOption("printPercentRepeat", &printPercentRepeat, ""); clp.RegisterFlagOption("h", &printHelp, ""); clp.SetHelp(helpString); clp.ParseCommandLine(argc, argv, fns); clp.CommandLineToString(argc, argv, commandLine); 
clp.SetProgramName("alchemy"); outputModelFileName = fns[0]; if (argc <= 1 or printHelp or outputModelFileName == "") { cout << helpString << endl; exit(0); } if (usePosMap) { CrucialOpen(posMapFileName, posMapFile, std::ios::out); } if (sourceReadsFileName == "" and fixedLength == 0) { useLengthModel = true; } if (useLengthModel and fixedLength != 0) { cout << "ERROR! You must either use a length model or a fixed length." << endl; exit(1); } if (sourceReadsFileName == "" and numBasesPerFile == 0) { cout << "ERROR! You must specify either a set of read to use as " << endl << "original reads for simulation or the total number of bases " << endl << "to simulate in each bas.h5 file." << endl; exit(1); } if (sourceReadsFileName == "" and refGenomeFileName == "") { cout << "ERROR! You must specify a genome to sample reads from or a set of read "<<endl << "to use as original reads for simulation." << endl; exit(1); } if (fixedLength != 0 and refGenomeFileName == "") { cout << "ERROR! You must specify a genome file if using a fixed length." << endl; exit(1); } if ((fixedLength != 0 or scaledLength != 0) and sourceReadsFileName != "") { cout << "ERROR! You cannot specify a fixed length nor mean length with a source " << endl << "reads file. The read lengths are taken from the source reads or the length model." << endl; exit(1); } LengthHistogram lengthHistogram; OutputSampleListSet outputModel(0); TitleTable titleTable; if (doRandGenInit) { InitializeRandomGeneratorWithTime(); } // // Read models. // if (titleTableFileName != "") { titleTable.Read(titleTableFileName); } outputModel.Read(outputModelFileName); if (useLengthModel) { lengthHistogram.BuildFromAlignmentLengths(outputModel.lengths); } vector<int> alignmentLengths; int meanAlignmentLength; if (scaledLength != 0 and useLengthModel) { // // Scale the histogram so that the average length is 'scaledLength'. // // 1. 
Integrate histogram long totalLength = 0; long totalSamples = 0; int hi; for (hi = 0; hi < lengthHistogram.lengthHistogram.cdf.size()-1; hi++) { int ni; ni = lengthHistogram.lengthHistogram.cdf[hi+1] - lengthHistogram.lengthHistogram.cdf[hi]; totalLength += ni * lengthHistogram.lengthHistogram.data[hi]; } totalSamples = lengthHistogram.lengthHistogram.cdf[lengthHistogram.lengthHistogram.cdf.size()-1]; float meanSampleLength = totalLength / (1.0*totalSamples); float fractionIncrease = scaledLength / meanSampleLength; for (hi = 0; hi < lengthHistogram.lengthHistogram.cdf.size(); hi++) { lengthHistogram.lengthHistogram.data[hi] *= fractionIncrease; } } FASTAReader inReader, seqReader; vector<FASTASequence> reference; DNALength refLength = 0; int i; if (refGenomeFileName != "") { inReader.Init(refGenomeFileName); inReader.ReadAllSequences(reference); for (i = 0; i < reference.size(); i++) { refLength += reference[i].length; } } if (sourceReadsFileName != "") { seqReader.Init(sourceReadsFileName); } ofstream readsFile; // // Create and simulate bas.h5 files. // int baseFileIndex; bool readsRemain = true; for (baseFileIndex = 0; ((sourceReadsFileName == "" and baseFileIndex < nBasFiles) // case 1 is reads are generated by file or (sourceReadsFileName != "" and readsRemain)); // case 2 is reads are generated by an input file. baseFileIndex++) { // // Prep the base file for writing. // stringstream fileNameStrm, movieNameStrm; //string movieName = "m000000_000000_00000_cSIMULATED_s"; movieNameStrm << movieName << baseFileIndex << "_p0"; string fullMovieName = movieNameStrm.str(); fileNameStrm << fullMovieName << ".bas.h5"; HDFBasWriter basWriter; HDFRegionTableWriter regionWriter; // // This is mainly used to create the atributes. // RegionTable regionTable; regionTable.CreateDefaultAttributes(); basWriter.SetPlatform(Springfield); // // Use a fixed set of fields for now. // // These are all pulled from the outputModel. 
basWriter.IncludeField("Basecall"); basWriter.IncludeField("QualityValue"); basWriter.IncludeField("SubstitutionQV"); basWriter.IncludeField("SubstitutionTag"); basWriter.IncludeField("InsertionQV"); basWriter.IncludeField("DeletionQV"); basWriter.IncludeField("DeletionTag"); basWriter.IncludeField("WidthInFrames"); basWriter.IncludeField("PreBaseFrames"); basWriter.IncludeField("PulseIndex"); vector<unsigned char> qualityValue, substitutionQV, substitutionTag, insertionQV, deletionQV, deletionTag; vector<HalfWord> widthInFrames, preBaseFrames, pulseIndex; // Just go from 0 .. hole Number basWriter.IncludeField("HoleNumber"); // Fixed to 0. basWriter.IncludeField("HoleXY"); if (usePosMap == false) { basWriter.IncludeField("SimulatedSequenceIndex"); basWriter.IncludeField("SimulatedCoordinate"); } basWriter.SetChangeListID("1.3.0.50.104380"); DNALength numSimulatedBases = 0; FASTASequence sampleSeq; //sampleSeq.length = readLength; int maxRetry = 10000000; int retryNumber = 0; int numReads = 0; int readLength = 0; while (numBasesPerFile == 0 or numSimulatedBases < numBasesPerFile) { DNALength seqIndex, seqPos; if (useLengthModel or fixedLength) { if (useLengthModel) { lengthHistogram.GetRandomLength(readLength); } else { readLength = fixedLength; } } if (refGenomeFileName != "") { FindRandomPos(reference, seqIndex, seqPos, readLength + (outputModel.keyLength - 1)); sampleSeq.seq = &reference[seqIndex].seq[seqPos]; sampleSeq.length = readLength + (outputModel.keyLength - 1); assert(reference[seqIndex].length >= sampleSeq.length); } else if (sourceReadsFileName != "") { if (seqReader.GetNext(sampleSeq) == false) { readsRemain = false; break; } if (sampleSeq.length < outputModel.keyLength) { continue; } // // Now attempt to parse the position from the fasta title. 
// if (useLengthModel) { int tryNumber = 0; readLength = 0; int maxNTries = 1000; int tryBuffer[5] = {-1,-1,-1,-1,-1}; while (tryNumber < maxNTries and readLength < outputModel.keyLength) { lengthHistogram.GetRandomLength(readLength); readLength = sampleSeq.length = min(sampleSeq.length, (unsigned int) readLength); tryBuffer[tryNumber%5] = readLength; tryNumber++; } if (tryNumber >= maxNTries) { cout << "ERROR. Could not generate a read length greater than the " << outputModel.keyLength << " requried " <<endl << "minimum number of bases using the length model specified in the alchemy." <<endl << "model. Something is either wrong with the model or the context length is too large." <<endl; cout << "The last few tries were: " << tryBuffer[0] << " " << tryBuffer[1] << " " << tryBuffer[2] << " " << tryBuffer[3] << " " << tryBuffer[4] << endl; exit(1); } } readLength = sampleSeq.length; vector<string> tokens; Tokenize(sampleSeq.title, "|", tokens); if (tokens.size() == 4) { seqPos = atoi(tokens[2].c_str()); if (titleTableFileName == "") { seqIndex = 0; } else { int index; titleTable.Lookup(tokens[1], index); seqIndex = index; } } else { seqPos = 0; } } // // If this is the first read printed to the base file, initialize it. // if (numSimulatedBases == 0) { basWriter.Initialize(fileNameStrm.str(), movieNameStrm.str(), Springfield); regionWriter.Initialize(basWriter.pulseDataGroup); } numSimulatedBases += readLength; int p; // create the sample sequence int contextLength = outputModel.keyLength; int contextMiddle = contextLength / 2; string outputString; int nDel = 0; int nIns = 0; // // Simulate to beyond the sample length. 
// qualityValue.clear(); substitutionQV.clear(); substitutionTag.clear(); insertionQV.clear(); deletionQV.clear(); deletionTag.clear(); pulseIndex.clear(); widthInFrames.clear(); preBaseFrames.clear(); assert(sampleSeq.length > contextMiddle + 1); for (p = contextMiddle; p < sampleSeq.length - contextMiddle - 1; p++) { string refContext; refContext.assign((const char*) &sampleSeq.seq[p-contextMiddle], contextLength); string outputContext; int contextWasFound; OutputSample sample; int i; for (i = 0; i < refContext.size(); i++) { refContext[i] = toupper(refContext[i]); } outputModel.SampleRandomSample(refContext, sample); if (sample.type == OutputSample::Deletion ) { // // There was a deletion. Advance in reference, then output // the base after the deletion. // p++; ++nDel; } int cp; // // Add the sampled context, possibly multiple characters because of an insertion. // for (i = 0; i < sample.nucleotides.size(); i++) { outputString.push_back(sample.nucleotides[i]); qualityValue.push_back(sample.qualities[i].qv[0]); deletionQV.push_back(sample.qualities[i].qv[1]); insertionQV.push_back(sample.qualities[i].qv[2]); substitutionQV.push_back(sample.qualities[i].qv[3]); deletionTag.push_back(sample.qualities[i].tags[0]); substitutionTag.push_back(sample.qualities[i].tags[1]); pulseIndex.push_back(sample.qualities[i].frameValues[0]); preBaseFrames.push_back(sample.qualities[i].frameValues[1]); widthInFrames.push_back(sample.qualities[i].frameValues[2]); } nIns += sample.qualities.size() - 1; } if (outputString.find('N') != outputString.npos or outputString.find('n') != outputString.npos) { cout << "WARNING! The sampled string " << endl << outputString << endl << "should not contain N's, but it seems to. This is being ignored "<<endl << "for now so that simulation may continue, but this shouldn't happen"<<endl << "and is really a bug." << endl; numSimulatedBases -= readLength; continue; } // // Ok, done creating the read, now time to create some quality values!!!!! 
// SMRTSequence read; read.length = outputString.size(); read.Allocate(read.length); memcpy(read.seq, outputString.c_str(), read.length * sizeof(unsigned char)); assert(qualityValue.size() == read.length * sizeof(unsigned char)); memcpy(read.qual.data, &qualityValue[0], read.length * sizeof(unsigned char)); memcpy(read.deletionQV.data, &deletionQV[0], read.length * sizeof(unsigned char)); memcpy(read.insertionQV.data, &insertionQV[0], read.length * sizeof(unsigned char)); memcpy(read.substitutionQV.data, &substitutionQV[0], read.length * sizeof(unsigned char)); memcpy(read.deletionTag, &deletionTag[0], read.length * sizeof(unsigned char)); memcpy(read.substitutionTag, &substitutionTag[0], read.length * sizeof(unsigned char)); memcpy(read.pulseIndex, &pulseIndex[0], read.length * sizeof(int)); memcpy(read.preBaseFrames, &preBaseFrames[0], read.length * sizeof(HalfWord)); memcpy(read.widthInFrames, &widthInFrames[0], read.length * sizeof(HalfWord)); // // The pulse index for now is just fake data. // int i; for (i = 0; i < read.length; i++) { read.pulseIndex[i] = 1; } read.xy[0] = seqIndex; read.xy[1] = seqPos; read.zmwData.holeNumber = numReads; basWriter.Write(read); // Record where this was simulated from. if (usePosMap == false) { basWriter.WriteSimulatedCoordinate(seqPos); basWriter.WriteSimulatedSequenceIndex(seqIndex); } else { posMapFile << fullMovieName << "/" << numReads << "/0_" << read.length << " " << seqIndex << " "<< seqPos; if (printPercentRepeat) { DNALength nRepeat = sampleSeq.GetRepeatContent(); posMapFile << " " << nRepeat*1.0/sampleSeq.length; } posMapFile << endl; } RegionAnnotation region; region.row[0] = read.zmwData.holeNumber; region.row[1] = 1; region.row[2] = 0; region.row[3] = read.length; region.row[4] = 1000; // Should be enough. regionWriter.Write(region); region.row[1] = 2; // Rewrite for hq region encompassing everything. 
regionWriter.Write(region); if (sourceReadsFileName != "") { sampleSeq.Free(); } read.Free(); ++numReads; } regionWriter.Finalize(regionTable.columnNames, regionTable.regionTypes, regionTable.regionDescriptions, regionTable.regionSources); basWriter.Close(); numReads = 0; // // The bas writer should automatically flush on closing. // } if (usePosMap) { posMapFile.close(); } for (i = 0; i < reference.size(); i++) { reference[i].Free(); } }
//
// Delegates the sequence work to DNASequence::MakeRC, forwarding rhsPos and
// rhsLength unchanged (presumably this fills rhs with the reverse
// complement, per the name).  Afterwards, if this sequence has a title, the
// title is copied onto rhs.
//
void FASTASequence::MakeRC(FASTASequence &rhs, DNALength rhsPos, DNALength rhsLength) {
  // View the destination through its DNASequence base so the base-class
  // routine receives exactly the argument type it was given before.
  DNASequence &rhsAsDNA = rhs;
  DNASequence::MakeRC(rhsAsDNA, rhsPos, rhsLength);

  // Nothing to propagate when this sequence is untitled.
  if (title == NULL) {
    return;
  }
  rhs.CopyTitle(title);
}
int main(int argc, char* argv[1]) { if (argc < 3) { cout << "Usage: findUnique genome.fasta query.fasta effective_k [options]" << endl; cout << " genome.fasta.sa must exist." << endl; cout << " Finds sequences at least effective_k in length that are unique." << endl; cout << " -max m Allow up to m matches" << endl; cout << " -minLength l Ensure the length of the match is at least this." << endl; cout << " -prefix p n Allow up to n matches across a prefix of length p" << endl; cout << " -suffix s n Allow up to n matches across a suffix of length s" << endl; cout << " Prefix and suffix options override max." << endl; cout << " -out file Print queries to this output file (query.fasta.queries)" << endl; exit(0); } DNASuffixArray sarray; string genomeFileName = argv[1]; string suffixArrayFileName = genomeFileName + ".sa"; FASTAReader reader; FASTASequence genome; int maxN = 0; int prefix = 0; int suffix = 0; int prefixN = 0; int suffixN = 0; int argi = 4; string outputFileName = ""; int minLength = 0; while (argi < argc) { if (strcmp(argv[argi], "-max") == 0) { ++argi; maxN = atoi(argv[argi]); } else if (strcmp(argv[argi], "-prefix") == 0) { ++argi; prefix = atoi(argv[argi]); ++argi; prefixN = atoi(argv[argi]); } else if (strcmp(argv[argi], "-suffix") == 0) { ++argi; suffix = atoi(argv[argi]); ++argi; suffixN = atoi(argv[argi]); } else if (strcmp(argv[argi], "-out") == 0) { ++argi; outputFileName = argv[argi]; } else if (strcmp(argv[argi], "-minLength") == 0) { ++argi; minLength = atoi(argv[argi]); } ++argi; } reader.Initialize(genomeFileName); reader.ReadAllSequencesIntoOne(genome); sarray.Read(suffixArrayFileName); FASTAReader queryReader; FASTASequence querySequence; string queryFileName = argv[2]; int maxLength = atoi(argv[3]); string summaryTableFileName = queryFileName + ".summary"; if (outputFileName == "") { outputFileName = queryFileName + ".queries"; } ofstream summaryTable(summaryTableFileName.c_str()); ofstream outputFile(outputFileName.c_str()); 
queryReader.Initialize(queryFileName); while (queryReader.GetNext(querySequence)) { int i; cerr << "searching " << querySequence.title << endl; if (querySequence.length < maxLength) { continue; } int nMatches = 0; querySequence.ToUpper(); int localMax; for (i = 0; i < querySequence.length - maxLength + 1; i++) { if ((i + 1) % 100000 == 0) { cerr << "processed: " << i + 1 << endl; } int lcpLength; vector<SAIndex> lcpLeftBounds, lcpRightBounds; vector<SAIndex> rclcpLeftBounds, rclcpRightBounds; localMax = maxN; if (i < prefix) { localMax = prefixN; } if (i >= querySequence.length - suffix) { localMax = suffixN; } if (querySequence.length - i <= maxLength) { continue; } if (querySequence.seq[i] == 'N') { continue; } lcpLength = sarray.StoreLCPBounds(genome.seq, genome.length, // The string which the suffix array is built on. &querySequence.seq[i], querySequence.length-i, true, maxLength, lcpLeftBounds, lcpRightBounds, false); if (lcpLength < minLength) { continue; } if (lcpLength < maxLength or lcpRightBounds.size() == 0 or (lcpRightBounds.size() > 0 and lcpLeftBounds.size() > 0 and lcpRightBounds[lcpRightBounds.size() - 1] - lcpLeftBounds[lcpLeftBounds.size()-1] <= localMax)) { FASTASequence rc; DNASequence subseq; subseq.ReferenceSubstring(querySequence, i, maxLength); subseq.MakeRC(rc); int rclcpLength; int numForwardMatches; if (lcpLength == 0) { numForwardMatches = 0; } else { numForwardMatches = lcpRightBounds[lcpRightBounds.size() - 1] - lcpLeftBounds[lcpLeftBounds.size()-1]; } rclcpLength = sarray.StoreLCPBounds(genome.seq, genome.length, // The string which the suffix array is built on. 
rc.seq, maxLength, true, rclcpLength, rclcpLeftBounds, rclcpRightBounds, false); string rcstr((const char*)rc.seq, rc.length); if (rclcpLength < maxLength or rclcpRightBounds.size() == 0 or (numForwardMatches + rclcpRightBounds[rclcpRightBounds.size() - 1] - rclcpLeftBounds[rclcpLeftBounds.size()-1] <= localMax)) { char* substr = new char[maxLength+1]; substr[maxLength] = '\0'; memcpy(substr, &querySequence.seq[i], maxLength); // string substr = string((const char*) querySequence.seq, i, maxLength); outputFile << querySequence.title << "\t" << substr << "\t" << i << endl; ++nMatches; delete[] substr; // } } rc.Free(); } } summaryTable << querySequence.title << "\t" << nMatches << endl; querySequence.Free(); } outputFile.close(); genome.Free(); }
int main(int argc, char* argv[]) { string barcodeFileName, insertFileName, outputFileName; if (argc != 4) { cout << "usage: makeBarcodeDatabase insert.fasta barcodes.fasta output.fasta" << endl; exit(1); } insertFileName = argv[1]; barcodeFileName = argv[2]; outputFileName = argv[3]; FASTAReader barcodeReader, insertReader; barcodeReader.Initialize(barcodeFileName); insertReader.Initialize(insertFileName); ofstream barcodedOut; CrucialOpen(outputFileName, barcodedOut, std::ios::out); vector<FASTASequence> forwardBarcodes, reverseBarcodes; FASTASequence barcodeSequence, reverseBarcodeSequence; while(barcodeReader.GetNext(barcodeSequence)) { forwardBarcodes.push_back(barcodeSequence); barcodeSequence.MakeRC(reverseBarcodeSequence); reverseBarcodes.push_back(reverseBarcodeSequence); } FASTASequence insert; insertReader.GetNext(insert); int i; for (i = 0; i < forwardBarcodes.size(); i++) { FASTASequence barcodedInsert; barcodedInsert.Resize(forwardBarcodes[i].length * 2 + insert.length); stringstream titleStrm; titleStrm << insert.title << "|ff|" << forwardBarcodes[i].title; barcodedInsert.CopyTitle(titleStrm.str()); memcpy(&barcodedInsert.seq[0], &forwardBarcodes[i].seq[0], forwardBarcodes[i].length); memcpy(&barcodedInsert.seq[forwardBarcodes[i].length], insert.seq, insert.length); memcpy(&barcodedInsert.seq[forwardBarcodes[i].length + insert.length], forwardBarcodes[i].seq, forwardBarcodes[i].length); barcodedInsert.PrintSeq(barcodedOut); titleStrm.str(""); titleStrm << insert.title << "|fr|" << forwardBarcodes[i].title; barcodedInsert.CopyTitle(titleStrm.str()); memcpy(&barcodedInsert.seq[0], &forwardBarcodes[i].seq[0], forwardBarcodes[i].length); memcpy(&barcodedInsert.seq[forwardBarcodes[i].length], insert.seq, insert.length); memcpy(&barcodedInsert.seq[forwardBarcodes[i].length + insert.length], reverseBarcodes[i].seq, reverseBarcodes[i].length); barcodedInsert.PrintSeq(barcodedOut); titleStrm.str(""); titleStrm << insert.title << "|rf|" << 
forwardBarcodes[i].title; barcodedInsert.CopyTitle(titleStrm.str()); memcpy(&barcodedInsert.seq[0], &reverseBarcodes[i].seq[0], reverseBarcodes[i].length); memcpy(&barcodedInsert.seq[reverseBarcodes[i].length], insert.seq, insert.length); memcpy(&barcodedInsert.seq[reverseBarcodes[i].length + insert.length], forwardBarcodes[i].seq, forwardBarcodes[i].length); barcodedInsert.PrintSeq(barcodedOut); titleStrm.str(""); titleStrm << insert.title << "|rr|" << forwardBarcodes[i].title; barcodedInsert.CopyTitle(titleStrm.str()); memcpy(&barcodedInsert.seq[0], &reverseBarcodes[i].seq[0], reverseBarcodes[i].length); memcpy(&barcodedInsert.seq[reverseBarcodes[i].length], insert.seq, insert.length); memcpy(&barcodedInsert.seq[reverseBarcodes[i].length + insert.length], reverseBarcodes[i].seq, reverseBarcodes[i].length); barcodedInsert.PrintSeq(barcodedOut); } }
int main(int argc, char* argv[]) { if (argc < 2) { PrintUsage(); exit(1); } int argi = 1; string saFile = argv[argi++]; vector<string> inFiles; int doBLT = 1; int bltPrefixLength = 8; int parsingOptions = 0; SAType saBuildType = larsson; int read4BitCompressed = 0; int diffCoverSize = 0; while (argi < argc) { if (strlen(argv[argi]) > 0 and argv[argi][0] == '-'){ parsingOptions = 1; } if (!parsingOptions) { inFiles.push_back(argv[argi]); } else { if (strcmp(argv[argi], "-blt") == 0) { doBLT = 1; if (argi < argc - 1) { bltPrefixLength = atoi(argv[++argi]); if (bltPrefixLength == 0) { cout << argv[argi] << " is not a valid lookup table length." << endl; exit(1); } } else { cout << "Please specify a lookup table length." << endl; exit(1); } } else if (strcmp(argv[argi], "-mamy") == 0) { saBuildType = manmy; } else if (strcmp(argv[argi], "-larsson") == 0) { saBuildType = larsson; } else if (strcmp(argv[argi], "-mcilroy") == 0) { saBuildType = mcilroy; } else if (strcmp(argv[argi], "-slow") == 0) { saBuildType = slow; } else if (strcmp(argv[argi], "-kark") == 0) { saBuildType = kark; } else if (strcmp(argv[argi], "-mafe") == 0) { saBuildType = mafe; } else if (strcmp(argv[argi], "-welter") == 0) { saBuildType = welter; } else if (strcmp(argv[argi], "-welterweight") == 0) { if (argi < argc-1) { diffCoverSize = atoi(argv[++argi]); } else { cout << "Please specify a difference cover size. Valid values are 7,32,64,111, and 2281. Larger values use less memory but may be slower." << endl; exit(1); } if ( ! (diffCoverSize == 7 or diffCoverSize == 32 or diffCoverSize == 64 or diffCoverSize == 111 or diffCoverSize == 2281) ) { cout << "The difference cover size must be one of 7,32,64,111, or 2281." << endl; cout << "Larger numbers use less space but are more slow." 
<< endl; exit(1); } } else if (strcmp(argv[argi], "-4bit") == 0) { read4BitCompressed = 1; } else { PrintUsage(); cout << "ERROR, bad option: " << argv[argi] << endl; exit(1); } } ++argi; } if (inFiles.size() == 0) { // // Special use case: the input file is a fasta file. Write to that file + .sa // inFiles.push_back(saFile); saFile = saFile + ".sa"; } VectorIndex inFileIndex; FASTASequence seq; CompressedSequence<FASTASequence> compSeq; if (read4BitCompressed == 0) { for (inFileIndex = 0; inFileIndex < inFiles.size(); ++inFileIndex) { FASTAReader reader; reader.Init(inFiles[inFileIndex]); reader.SetSpacePadding(111); if (saBuildType == kark) { // // The Karkkainen sa building method requires a little extra // space at the end of the dna sequence so that counting may // be done mod 3 without adding extra logic for boundaries. // } if (inFileIndex == 0) { reader.ReadAllSequencesIntoOne(seq); reader.Close(); } else { while(reader.ConcatenateNext(seq)) { cout << "added " << seq.title << endl; } } } seq.ToThreeBit(); //seq.ToUpper(); } else { assert(inFiles.size() == 1); cout << "reading compressed sequence." << endl; compSeq.Read(inFiles[0]); seq.seq = compSeq.seq; seq.length = compSeq.length; compSeq.RemoveCompressionCounts(); cout << "done." << endl; } // // For now, do not allow creation of suffix arrays on sequences > 4G. // if (seq.length >= UINT_MAX) { cout << "ERROR, references greater than " << UINT_MAX << " bases are not supported." << endl; cout << "Consider breaking the reference into multiple files, running alignment. " << endl; cout << "against each file, and merging the result." 
<< endl; exit(1); } vector<int> alphabet; SuffixArray<Nucleotide, vector<int> > sa; // sa.InitTwoBitDNAAlphabet(alphabet); // sa.InitAsciiCharDNAAlphabet(alphabet); sa.InitThreeBitDNAAlphabet(alphabet); if (saBuildType == manmy) { sa.MMBuildSuffixArray(seq.seq, seq.length, alphabet); } else if (saBuildType == mcilroy) { sa.index = new SAIndex[seq.length+1]; DNALength i; for (i = 0; i < seq.length; i++) { sa.index[i] = seq.seq[i] + 1;} sa.index[seq.length] = 0; ssort(sa.index, NULL); for (i = 1; i < seq.length+1; i++ ){ sa.index[i-1] = sa.index[i];}; sa.length = seq.length; } else if (saBuildType == larsson) { sa.LarssonBuildSuffixArray(seq.seq, seq.length, alphabet); } else if (saBuildType == kark) { sa.index = new SAIndex[seq.length]; seq.ToThreeBit(); DNALength p; for (p = 0; p < seq.length; p++ ){ seq.seq[p]++; } KarkkainenBuildSuffixArray<Nucleotide>(seq.seq, sa.index, seq.length, 5); sa.length = seq.length; } else if (saBuildType == mafe) { // sa.MaFeBuildSuffixArray(seq.seq, seq.length); } else if (saBuildType == welter) { if (diffCoverSize == 0) { sa.LightweightBuildSuffixArray(seq.seq, seq.length); } else { sa.LightweightBuildSuffixArray(seq.seq, seq.length, diffCoverSize); } } if (doBLT) { sa.BuildLookupTable(seq.seq, seq.length, bltPrefixLength); } sa.Write(saFile); return 0; }