int main(int argc, char* argv[]) { if (argc < 4) { PrintUsage(); exit(1); } int argi = 1; string saInFile = argv[argi++]; string genomeFileName = argv[argi++]; string saOutFile = argv[argi++]; vector<string> inFiles; int doBLT = 0; int doBLCP = 0; int bltPrefixLength = 0; int lcpLength = 0; int parsingOptions = 0; while (argi < argc) { if (strcmp(argv[argi], "-blt") == 0) { doBLT = 1; bltPrefixLength = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-blcp") == 0) { doBLCP = 1; lcpLength = atoi(argv[++argi]); } else { PrintUsage(); cout << "Bad option: " << argv[argi] << endl; exit(1); } ++argi; } // // Read the suffix array to modify. // DNASuffixArray sa; sa.Read(saInFile); FASTAReader reader; reader.Initialize(genomeFileName); FASTASequence seq; reader.ReadAllSequencesIntoOne(seq); if (doBLT) { sa.BuildLookupTable(seq.seq, seq.length, bltPrefixLength); } if (doBLCP) { cout << "LCP Table not yet implemented." << endl; } sa.Write(saOutFile); }
int main(int argc, char* argv[1]) { if (argc < 3) { cout << "Usage: findUnique genome.fasta query.fasta effective_k [options]" << endl; cout << " genome.fasta.sa must exist." << endl; cout << " Finds sequences at least effective_k in length that are unique." << endl; cout << " -max m Allow up to m matches" << endl; cout << " -minLength l Ensure the length of the match is at least this." << endl; cout << " -prefix p n Allow up to n matches across a prefix of length p" << endl; cout << " -suffix s n Allow up to n matches across a suffix of length s" << endl; cout << " Prefix and suffix options override max." << endl; cout << " -out file Print queries to this output file (query.fasta.queries)" << endl; exit(0); } DNASuffixArray sarray; string genomeFileName = argv[1]; string suffixArrayFileName = genomeFileName + ".sa"; FASTAReader reader; FASTASequence genome; int maxN = 0; int prefix = 0; int suffix = 0; int prefixN = 0; int suffixN = 0; int argi = 4; string outputFileName = ""; int minLength = 0; while (argi < argc) { if (strcmp(argv[argi], "-max") == 0) { ++argi; maxN = atoi(argv[argi]); } else if (strcmp(argv[argi], "-prefix") == 0) { ++argi; prefix = atoi(argv[argi]); ++argi; prefixN = atoi(argv[argi]); } else if (strcmp(argv[argi], "-suffix") == 0) { ++argi; suffix = atoi(argv[argi]); ++argi; suffixN = atoi(argv[argi]); } else if (strcmp(argv[argi], "-out") == 0) { ++argi; outputFileName = argv[argi]; } else if (strcmp(argv[argi], "-minLength") == 0) { ++argi; minLength = atoi(argv[argi]); } ++argi; } reader.Initialize(genomeFileName); reader.ReadAllSequencesIntoOne(genome); sarray.Read(suffixArrayFileName); FASTAReader queryReader; FASTASequence querySequence; string queryFileName = argv[2]; int maxLength = atoi(argv[3]); string summaryTableFileName = queryFileName + ".summary"; if (outputFileName == "") { outputFileName = queryFileName + ".queries"; } ofstream summaryTable(summaryTableFileName.c_str()); ofstream outputFile(outputFileName.c_str()); queryReader.Initialize(queryFileName); while (queryReader.GetNext(querySequence)) { int i; cerr << "searching " << querySequence.title << endl; if (querySequence.length < maxLength) { continue; } int nMatches = 0; querySequence.ToUpper(); int localMax; for (i = 0; i < querySequence.length - maxLength + 1; i++) { if ((i + 1) % 100000 == 0) { cerr << "processed: " << i + 1 << endl; } int lcpLength; vector<SAIndex> lcpLeftBounds, lcpRightBounds; vector<SAIndex> rclcpLeftBounds, rclcpRightBounds; localMax = maxN; if (i < prefix) { localMax = prefixN; } if (i >= querySequence.length - suffix) { localMax = suffixN; } if (querySequence.length - i <= maxLength) { continue; } if (querySequence.seq[i] == 'N') { continue; } lcpLength = sarray.StoreLCPBounds(genome.seq, genome.length, // The string which the suffix array is built on. &querySequence.seq[i], querySequence.length-i, true, maxLength, lcpLeftBounds, lcpRightBounds, false); if (lcpLength < minLength) { continue; } if (lcpLength < maxLength or lcpRightBounds.size() == 0 or (lcpRightBounds.size() > 0 and lcpLeftBounds.size() > 0 and lcpRightBounds[lcpRightBounds.size() - 1] - lcpLeftBounds[lcpLeftBounds.size()-1] <= localMax)) { FASTASequence rc; DNASequence subseq; subseq.ReferenceSubstring(querySequence, i, maxLength); subseq.MakeRC(rc); int rclcpLength; int numForwardMatches; if (lcpLength == 0) { numForwardMatches = 0; } else { numForwardMatches = lcpRightBounds[lcpRightBounds.size() - 1] - lcpLeftBounds[lcpLeftBounds.size()-1]; } rclcpLength = sarray.StoreLCPBounds(genome.seq, genome.length, // The string which the suffix array is built on. rc.seq, maxLength, true, rclcpLength, rclcpLeftBounds, rclcpRightBounds, false); string rcstr((const char*)rc.seq, rc.length); if (rclcpLength < maxLength or rclcpRightBounds.size() == 0 or (numForwardMatches + rclcpRightBounds[rclcpRightBounds.size() - 1] - rclcpLeftBounds[rclcpLeftBounds.size()-1] <= localMax)) { char* substr = new char[maxLength+1]; substr[maxLength] = '\0'; memcpy(substr, &querySequence.seq[i], maxLength); // string substr = string((const char*) querySequence.seq, i, maxLength); outputFile << querySequence.title << "\t" << substr << "\t" << i << endl; ++nMatches; delete[] substr; // } } rc.Free(); } } summaryTable << querySequence.title << "\t" << nMatches << endl; querySequence.Free(); } outputFile.close(); genome.Free(); }
int main(int argc, char* argv[]) { string refFileName, notNormalFileName, normalFileName; if (argc < 4) { cout << "usage: normalizeGCContent ref source dest " << endl << " flips the C/Gs in source randomly until they are the same gc content as ref." << endl; exit(1); } refFileName = argv[1]; notNormalFileName = argv[2]; normalFileName = argv[3]; FASTAReader reader; FASTAReader queryReader; FASTASequence ref; vector<FASTASequence> querySequences; int queryTotalLength; reader.Initialize(refFileName); reader.ReadAllSequencesIntoOne(ref); queryReader.Initialize(notNormalFileName); int refCounts[5], queryCounts[5]; int s; refCounts[0] = refCounts[1] =refCounts[2] = refCounts[3] = refCounts[4] = 0; queryCounts[0] = queryCounts[1] =queryCounts[2] = queryCounts[3] = queryCounts[4] = 0; queryReader.ReadAllSequences(querySequences); ofstream normOut; CrucialOpen(normalFileName, normOut); CountNucs(ref, refCounts); float refGC = (1.0*refCounts[TwoBit['c']] + refCounts[TwoBit['g']]) / (refCounts[TwoBit['a']] + refCounts[TwoBit['c']] + refCounts[TwoBit['g']] + refCounts[TwoBit['t']]); int q; for (q = 0; q < querySequences.size(); q++) { CountNucs(querySequences[q], queryCounts); } float queryGC = (1.0*queryCounts[TwoBit['c']] + queryCounts[TwoBit['g']]) / (queryCounts[TwoBit['a']] + queryCounts[TwoBit['c']] + queryCounts[TwoBit['g']] + queryCounts[TwoBit['t']]); float gcToat = 0.0; float atTogc = 0.0; if (refGC > queryGC) { atTogc = (refGC - queryGC); } else { gcToat = (queryGC - refGC); } DNALength queryGenomeLength = queryCounts[0] + queryCounts[1] + queryCounts[2] + queryCounts[3] + queryCounts[4]; DNALength unmaskedQueryLength = queryCounts[0] + queryCounts[1] + queryCounts[2] + queryCounts[3]; DNALength ngc2at = unmaskedQueryLength * gcToat; DNALength nat2gc = unmaskedQueryLength * atTogc; cout << refGC << " " << queryGC << " " << gcToat << " " << atTogc << " " << ngc2at << " " << nat2gc << endl; vector<FASTASequence> normalized; normalized.resize(querySequences.size()); vector<DNALength> cumLengths; cumLengths.resize(normalized.size()+1); cumLengths[0] = 0; for (q = 0; q < querySequences.size(); q++) { normalized[q] = querySequences[q]; cumLengths[q+1] = cumLengths[q] + querySequences[q].length; } DNALength i; for (i = 0; i < ngc2at; i+=2) { DNALength pos, chr; FindRandomNuc(normalized, queryGenomeLength, cumLengths, 'G', chr, pos); normalized[chr].seq[pos] = 'A'; FindRandomNuc(normalized, queryGenomeLength, cumLengths, 'C', chr, pos); normalized[chr].seq[pos] = 'T'; } for (i = 0; i < nat2gc; i+=2) { DNALength pos, chr; FindRandomNuc(normalized, queryGenomeLength, cumLengths, 'A', chr, pos); normalized[chr].seq[pos] = 'g'; FindRandomNuc(normalized, queryGenomeLength, cumLengths, 'T', chr, pos); normalized[chr].seq[pos] = 'c'; } for (q = 0; q < normalized.size(); q++ ){ normalized[q].PrintSeq(normOut); } }
int main(int argc, char* argv[]) { if (argc < 2) { PrintUsage(); exit(1); } int argi = 1; string saFile = argv[argi++]; vector<string> inFiles; int doBLT = 1; int bltPrefixLength = 8; int parsingOptions = 0; SAType saBuildType = larsson; int read4BitCompressed = 0; int diffCoverSize = 0; while (argi < argc) { if (strlen(argv[argi]) > 0 and argv[argi][0] == '-'){ parsingOptions = 1; } if (!parsingOptions) { inFiles.push_back(argv[argi]); } else { if (strcmp(argv[argi], "-blt") == 0) { doBLT = 1; if (argi < argc - 1) { bltPrefixLength = atoi(argv[++argi]); if (bltPrefixLength == 0) { cout << argv[argi] << " is not a valid lookup table length." << endl; exit(1); } } else { cout << "Please specify a lookup table length." << endl; exit(1); } } else if (strcmp(argv[argi], "-mamy") == 0) { saBuildType = manmy; } else if (strcmp(argv[argi], "-larsson") == 0) { saBuildType = larsson; } else if (strcmp(argv[argi], "-mcilroy") == 0) { saBuildType = mcilroy; } else if (strcmp(argv[argi], "-slow") == 0) { saBuildType = slow; } else if (strcmp(argv[argi], "-kark") == 0) { saBuildType = kark; } else if (strcmp(argv[argi], "-mafe") == 0) { saBuildType = mafe; } else if (strcmp(argv[argi], "-welter") == 0) { saBuildType = welter; } else if (strcmp(argv[argi], "-welterweight") == 0) { if (argi < argc-1) { diffCoverSize = atoi(argv[++argi]); } else { cout << "Please specify a difference cover size. Valid values are 7,32,64,111, and 2281. Larger values use less memory but may be slower." << endl; exit(1); } if ( ! (diffCoverSize == 7 or diffCoverSize == 32 or diffCoverSize == 64 or diffCoverSize == 111 or diffCoverSize == 2281) ) { cout << "The difference cover size must be one of 7,32,64,111, or 2281." << endl; cout << "Larger numbers use less space but are more slow." << endl; exit(1); } } else if (strcmp(argv[argi], "-4bit") == 0) { read4BitCompressed = 1; } else { PrintUsage(); cout << "ERROR, bad option: " << argv[argi] << endl; exit(1); } } ++argi; } if (inFiles.size() == 0) { // // Special use case: the input file is a fasta file. Write to that file + .sa // inFiles.push_back(saFile); saFile = saFile + ".sa"; } VectorIndex inFileIndex; FASTASequence seq; CompressedSequence<FASTASequence> compSeq; if (read4BitCompressed == 0) { for (inFileIndex = 0; inFileIndex < inFiles.size(); ++inFileIndex) { FASTAReader reader; reader.Init(inFiles[inFileIndex]); reader.SetSpacePadding(111); if (saBuildType == kark) { // // The Karkkainen sa building method requires a little extra // space at the end of the dna sequence so that counting may // be done mod 3 without adding extra logic for boundaries. // } if (inFileIndex == 0) { reader.ReadAllSequencesIntoOne(seq); reader.Close(); } else { while(reader.ConcatenateNext(seq)) { cout << "added " << seq.title << endl; } } } seq.ToThreeBit(); //seq.ToUpper(); } else { assert(inFiles.size() == 1); cout << "reading compressed sequence." << endl; compSeq.Read(inFiles[0]); seq.seq = compSeq.seq; seq.length = compSeq.length; compSeq.RemoveCompressionCounts(); cout << "done." << endl; } // // For now, do not allow creation of suffix arrays on sequences > 4G. // if (seq.length >= UINT_MAX) { cout << "ERROR, references greater than " << UINT_MAX << " bases are not supported." << endl; cout << "Consider breaking the reference into multiple files, running alignment. " << endl; cout << "against each file, and merging the result." << endl; exit(1); } vector<int> alphabet; SuffixArray<Nucleotide, vector<int> > sa; // sa.InitTwoBitDNAAlphabet(alphabet); // sa.InitAsciiCharDNAAlphabet(alphabet); sa.InitThreeBitDNAAlphabet(alphabet); if (saBuildType == manmy) { sa.MMBuildSuffixArray(seq.seq, seq.length, alphabet); } else if (saBuildType == mcilroy) { sa.index = new SAIndex[seq.length+1]; DNALength i; for (i = 0; i < seq.length; i++) { sa.index[i] = seq.seq[i] + 1;} sa.index[seq.length] = 0; ssort(sa.index, NULL); for (i = 1; i < seq.length+1; i++ ){ sa.index[i-1] = sa.index[i];}; sa.length = seq.length; } else if (saBuildType == larsson) { sa.LarssonBuildSuffixArray(seq.seq, seq.length, alphabet); } else if (saBuildType == kark) { sa.index = new SAIndex[seq.length]; seq.ToThreeBit(); DNALength p; for (p = 0; p < seq.length; p++ ){ seq.seq[p]++; } KarkkainenBuildSuffixArray<Nucleotide>(seq.seq, sa.index, seq.length, 5); sa.length = seq.length; } else if (saBuildType == mafe) { // sa.MaFeBuildSuffixArray(seq.seq, seq.length); } else if (saBuildType == welter) { if (diffCoverSize == 0) { sa.LightweightBuildSuffixArray(seq.seq, seq.length); } else { sa.LightweightBuildSuffixArray(seq.seq, seq.length, diffCoverSize); } } if (doBLT) { sa.BuildLookupTable(seq.seq, seq.length, bltPrefixLength); } sa.Write(saFile); return 0; }
int main(int argc, char* argv[]) { string genomeFileName; string suffixArrayFileName; if (argc < 4) { cout << "Usage: printWordCount genome suffixArray k [k2 k3 k4...]" << endl; exit(1); } genomeFileName = argv[1]; suffixArrayFileName = argv[2]; int argi = 3; vector<DNALength> k; while (argi < argc) { k.push_back(atoi(argv[argi])); argi++; } // Get the ref sequence. FASTAReader reader; reader.Init(genomeFileName); FASTASequence seq; // reader.GetNext(seq); reader.ReadAllSequencesIntoOne(seq); seq.ToUpper(); // Get the suffix array. DNASuffixArray sarray; sarray.Read(suffixArrayFileName); int ki; char *word; cout << "wordlen word nword" << endl; for (ki = 0; ki < k.size(); ki++) { word = new char[k[ki]+1]; word[k[ki]] = '\0'; DNALength i; DNALength numUnique = 0; for (i = 0; i < seq.length - k[ki] - 1; ) { DNALength j = i + 1; bool seqAtN = false; int si; for(si = 0; si < k[ki]; si++) { if (seq.seq[sarray.index[i] + si] == 'N') { seqAtN = true; break; } } if (seqAtN) { i++; continue; } while (j < seq.length - k[ki] and seq.length - sarray.index[i] >= k[ki] and seq.length - sarray.index[j] >= k[ki] and strncmp((const char*) &seq.seq[sarray.index[i]], (const char*) &seq.seq[sarray.index[j]], k[ki]) == 0) { j++; } if (seq.length - sarray.index[i] >= k[ki]) { for(si = 0; si < k[ki]; si++) { word[si] = seq.seq[sarray.index[i]+si]; } cout << k[ki] << " " << word << " " << j - i + 1 << endl; if (j == i + 1) { ++numUnique; } } i = j; } } }