int main(int argc, char* argv[]) { string seqFileName; TupleMetrics tm; string outFileName; if (argc < 3) { cout << "usage: storeTuplePosList seqFile tupleSize outFile" << endl; return 0; } seqFileName = argv[1]; tm.tupleSize = atoi(argv[2]); outFileName = argv[3]; ofstream outFile; // CrucialOpen(outFileName, outFile, std::ios::out| std::ios::binary); FASTAReader reader; reader.Init(seqFileName); FASTASequence seq; reader.GetNext(seq); // vector<PositionDNATuple> TupleList<PositionDNATuple>tuplePosList; tuplePosList.SetTupleMetrics(tm); // StoreTuplePosList(seq, tm, tuplePosList); SequenceToTupleList(seq, tm, tuplePosList); tuplePosList.Sort(); tuplePosList.WriteToFile(outFileName); //WriteTuplePosList(tuplePosList, tm.tupleSize, outFile); outFile.close(); return 0; }
int main(int argc, char* argv[]) { if (argc < 3) { cout << "usage: testRandomSequence genome.fa ntries " << endl; exit(0); } string inFile = argv[1]; int nSamples = atoi(argv[2]); if (nSamples == 0) { return 0; } FASTAReader reader; reader.Initialize(inFile); vector<FASTASequence> genome; reader.ReadAllSequences(genome); int i; cout << "title pos" << endl; for (i = 0; i < nSamples; i++) { DNALength chrIndex, chrPos; FindRandomPos(genome, chrIndex, chrPos); cout << genome[chrIndex].title << " " << chrPos << endl; } return 0; }
int main(int argc, char* argv[]) { string inFileName, outFileName; int length; inFileName = argv[1]; outFileName = argv[2]; length = atoi(argv[3]); int argi = 4; int stride = 0; float coverage = 0; while (argi < argc) { if (strcmp(argv[argi], "-stride")) { stride = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-coverage")) { coverage = atof(argv[++argi]); } ++argi; } FASTAReader reader; reader.Initialize(inFileName); FASTASequence genome; reader.GetNext(genome); if (stride == 0 and coverage == 0) { cout << "ERROR, must provide stride or coverage. " << endl; exit(1); } if (stride == 0) { stride = genome.length * coverage / length; }
int main(int argc, char* argv[]) { if (argc < 3) { cout << "usage: testBuildOccBins genomeFileName suffixArray" << endl; exit(0); } string genomeFileName = argv[1]; string suffixArrayFileName = argv[2]; FASTAReader reader; reader.Init(genomeFileName); FASTASequence seq; reader.GetNext(seq); DNASuffixArray suffixArray; suffixArray.Read(suffixArrayFileName); Bwt<PackedDNASequence, FASTASequence> bwt; //bwt.InitializeFromSuffixArray(seq, suffixArray.index); bwt.InitializeBWTStringFromSuffixArray(seq, suffixArray.index); bwt.occ.Initialize(bwt.bwtSequence, 4096, 64); bwt.occ.PrintBins(cout); }
int main(int argc, char* argv[]) { if (argc < 4) { PrintUsage(); exit(1); } int argi = 1; string saInFile = argv[argi++]; string genomeFileName = argv[argi++]; string saOutFile = argv[argi++]; vector<string> inFiles; int doBLT = 0; int doBLCP = 0; int bltPrefixLength = 0; int lcpLength = 0; int parsingOptions = 0; while (argi < argc) { if (strcmp(argv[argi], "-blt") == 0) { doBLT = 1; bltPrefixLength = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-blcp") == 0) { doBLCP = 1; lcpLength = atoi(argv[++argi]); } else { PrintUsage(); cout << "Bad option: " << argv[argi] << endl; exit(1); } ++argi; } // // Read the suffix array to modify. // DNASuffixArray sa; sa.Read(saInFile); FASTAReader reader; reader.Initialize(genomeFileName); FASTASequence seq; reader.ReadAllSequencesIntoOne(seq); if (doBLT) { sa.BuildLookupTable(seq.seq, seq.length, bltPrefixLength); } if (doBLCP) { cout << "LCP Table not yet implemented." << endl; } sa.Write(saOutFile); }
int main(int argc, char* argv[]) { if (argc < 3) { cout << "usage: bwtLocateList bwtName querySeqFile" << endl; exit(1); } string bwtFileName = argv[1]; string querySeqFileName = argv[2]; bool doPrintResults = false; int maxCount = 0; int argi = 3; bool countOnly = false; while(argi < argc) { if (strcmp(argv[argi], "-print") == 0) { doPrintResults = true; } else if (strcmp(argv[argi], "-max") == 0) { maxCount = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-count") == 0) { countOnly = true; } else { cout << "bad option: " << argv[argi] << endl; } ++argi; } Bwt<PackedDNASequence, FASTASequence> bwt; bwt.Read(bwtFileName); FASTAReader queryReader; queryReader.Init(querySeqFileName); FASTASequence seq; int seqIndex = 0; vector<DNALength> positions; while(queryReader.GetNext(seq)) { positions.clear(); if (countOnly == false) { bwt.Locate(seq, positions, maxCount); } else { DNALength sp,ep; bwt.Count(seq, sp, ep); } // cout << "matched " << positions.size() << " positions." << endl; if (doPrintResults) { int i; for (i = 0; i < positions.size(); i++ ){ cout << positions[i] << " "; } cout << endl; } ++seqIndex; } // float wordCountsPerLookup = (bwt.bwtSequence.nCountInWord *1.0) / bwt.bwtSequence.nCountNuc; // cout << "word counts per lookup: " << wordCountsPerLookup << endl; return 0; }
int main(int argc, char* argv[]) { std::string seqInName, seqOutName, dotOutName; if (argc < 4) { std::cout << "usage: exciseRepeats inName repMaskOutFile outName" << std::endl; std::exit(EXIT_FAILURE); } seqInName = argv[1]; dotOutName = argv[2]; seqOutName = argv[3]; FASTAReader reader; reader.Initialize(seqInName); FASTASequence origSeq; reader.GetNext(origSeq); std::ifstream dotOutFile; CrucialOpen(dotOutName, dotOutFile); std::ofstream seqOutFile; std::ofstream seqOut; CrucialOpen(seqOutName, seqOut, std::ios::out); std::string dotOutLine; getline(dotOutFile, dotOutLine); getline(dotOutFile, dotOutLine); getline(dotOutFile, dotOutLine); while (getline(dotOutFile, dotOutLine)) { std::stringstream lineStrm(dotOutLine); int swScore; float pctDiv, pctDel, pctIns; std::string query; DNALength qPosBegin, qPosEnd; std::string left; char strand; std::string matchingRepeat; std::string repClass; std::string repPos, repEnd, repLeft; int id; lineStrm >> swScore >> pctDiv >> pctDel >> pctIns >> query >> qPosBegin >> qPosEnd >> left >> strand >> matchingRepeat >> repClass >> repPos >> repEnd >> repLeft >> id; for (DNALength seqPos = qPosBegin; seqPos < qPosEnd; seqPos++) { origSeq.seq[seqPos] = 'X'; } } DNALength seqPos, unexPos; unexPos = 0; for (seqPos = 0; seqPos < origSeq.length; seqPos++) { if (origSeq.seq[seqPos] != 'X') { origSeq.seq[unexPos] = origSeq.seq[seqPos]; unexPos++; } } origSeq.length = unexPos; origSeq.PrintSeq(seqOut); return 0; }
/* * Wrapper for FASTAReader */ bool read_seqs(FASTAReader &reader1, FASTAReader &reader2, Buffer<int16_t> *seqs1, Buffer<int16_t> *seqs2, int *seqs1_len, int *seqs2_len, std::vector<std::string> *seqs1_ids, std::vector<std::string> *seqs2_ids) { bool ans; #pragma omp critical ans = reader1.next(seqs1, seqs1_len, seqs1_ids) && reader2.next(seqs2, seqs2_len, seqs2_ids); return ans; }
int main(int argc, char* argv[]) { string genomeFileName, subseqFileName; if (argc != 3) { cout << "usage: extractRepeats genome repeat" << endl; exit(0); } genomeFileName = argv[1]; subseqFileName = argv[2]; FASTASequence genome, sub; FASTAReader reader; reader.Init(genomeFileName); reader.GetNext(genome); reader.Init(subseqFileName); reader.GetNext(sub); genome.ToUpper(); sub.ToUpper(); DNALength genomePos; FASTASequence genomeSub; int kband = (int) (0.15) * sub.length; vector<int> scoreMat; vector<Arrow> pathMat; int readIndex = 0; cout << "starting extraction" << endl; for (genomePos = 0; genomePos < genome.length - sub.length + 1; genomePos++) { genomeSub.seq = &genome.seq[genomePos]; genomeSub.length = sub.length; int alignScore; Alignment alignment; alignScore = SWAlign(genomeSub, sub, EditDistanceMatrix, 1, //1,kband, scoreMat, pathMat, alignment, QueryFit); if (alignScore < 0.25 * sub.length) { stringstream titlestrm; titlestrm << readIndex << "|" << genomePos << "|" << genomePos + sub.length << " " << alignScore/ (1.0*sub.length); FASTASequence subcopy; subcopy.CopyTitle(titlestrm.str()); subcopy.seq = &genome.seq[genomePos]; subcopy.length = sub.length; subcopy.PrintSeq(std::cout); genomePos += sub.length; } } }
int main(int argc, char *argv[]) { string sequencesInName, sequencesOutName; if (argc <3){ cout << "usage: scramble in out" << endl; exit(1); } sequencesInName = argv[1]; sequencesOutName= argv[2]; vector<FASTASequence*> sequences; vector<int> sequenceIndices; FASTAReader reader; reader.Init(sequencesInName); ofstream out; CrucialOpen(sequencesOutName, out, std::ios::out); FASTASequence read; FASTASequence*readPtr; while(reader.GetNext(read)) { readPtr = new FASTASequence; *readPtr = read; sequences.push_back(readPtr); } int i; for (i = 0; i < sequences.size(); i++) { sequenceIndices.push_back(i); } for (i = 0; i < 10*sequences.size(); i++ ){ // // shuffle indices. // int idx1; int idx2; idx1 = RandomInt(sequences.size()); idx2 = RandomInt(sequences.size()); int tmp; tmp = sequenceIndices[idx1]; sequenceIndices[idx1] = sequenceIndices[idx2]; sequenceIndices[idx2] = tmp; } for (i = 0; i < sequenceIndices.size(); i++ ){ sequences[sequenceIndices[i]]->PrintSeq(out); } return 0; }
int main(int argc, char* argv[]) { CommandLineParser clp; string fastaFileName, indexFileName; vector<string> fastaFileNames; vector<string> opts; clp.SetProgramName("bsdb"); clp.SetProgramSummary("Build an index database on a file of sequences.\n" " The index is used to map to reads given alignment positions.\n"); clp.RegisterStringOption("fasta", &fastaFileName, "A file with sequences to build an index."); clp.RegisterStringOption("index", &indexFileName, "The index file."); clp.RegisterPreviousFlagsAsHidden(); clp.ParseCommandLine(argc, argv, opts); ifstream fastaIn; ofstream indexOut; if (FileOfFileNames::IsFOFN(fastaFileName)) { FileOfFileNames::FOFNToList(fastaFileName, fastaFileNames); } else { fastaFileNames.push_back(fastaFileName); } CrucialOpen(indexFileName, indexOut, std::ios::out | std::ios::binary); SequenceIndexDatabase<FASTASequence> seqDB; int fileNameIndex; for (fileNameIndex = 0; fileNameIndex < fastaFileNames.size(); fileNameIndex++){ FASTAReader reader; FASTASequence seq; reader.Init(fastaFileNames[fileNameIndex]); int i = 0; while (reader.GetNext(seq)) { seqDB.AddSequence(seq); i++; } } seqDB.Finalize(); seqDB.WriteDatabase(indexOut); return 0; }
int main(int argc, char* argv[]) { if (argc < 4) { cout << "usage: splitContigs in.fa contiglength out" << endl; exit(1); } string inFileName, outFileName; inFileName = argv[1]; int contigLength = atoi(argv[2]); outFileName = argv[3]; ofstream seqOut; CrucialOpen(outFileName, seqOut, std::ios::out); FASTAReader reader; reader.Init(inFileName); FASTASequence seq; DNALength curOffset; while(reader.GetNext(seq)) { FASTASequence subseq; int i; curOffset = 0; for (i =0 ; i < seq.length / contigLength + 1; i++ ) { subseq.seq = &seq.seq[curOffset]; subseq.title = seq.title; if (curOffset + contigLength > seq.length) { subseq.length = seq.length - curOffset; } else { subseq.length = contigLength; } subseq.PrintSeq(seqOut); curOffset += contigLength; } } return 0; }
int main(int argc, char* argv[]) { string refFileName, queryFileName; int maxHammingDistance; if (argc < 4) { cout << "usage: hammer ref query maxHam " << endl; exit(1); } refFileName = argv[1]; queryFileName = argv[2]; maxHammingDistance = atoi(argv[3]); FASTAReader reader; reader.Initialize(refFileName); FASTASequence ref, refRC; reader.GetNext(ref); ref.MakeRC(refRC); FASTAReader queryReader; queryReader.Initialize(queryFileName); FASTASequence query; queryReader.GetNext(query); DNALength p; for(p=0; p < ref.length-query.length-1; p++ ){ DNASequence subseq; subseq.seq = &ref.seq[p]; subseq.length = query.length; // cout << "t "; subseq.PrintSeq(cout); // cout << "q "; ((DNASequence*)&query)->PrintSeq(cout); if (HammingDistance(&subseq.seq[0], &query.seq[0], query.length) < maxHammingDistance) { cout << ">" << p << endl; subseq.PrintSeq(cout); } int i; for (i =0; i < query.length; i++) { subseq.seq[i] = toupper(subseq.seq[i]); } } for(p=0; p < ref.length-query.length-1; p++ ){ DNASequence subseq; subseq.seq = &refRC.seq[p]; subseq.length = query.length; if (HammingDistance(&subseq.seq[0], &query.seq[0], query.length) < maxHammingDistance) { cout << ">" << p << "rc" << endl; subseq.PrintSeq(cout); } int i; for (i =0; i < query.length; i++) { subseq.seq[i] = toupper(subseq.seq[i]); } } }
int main(int argc, char* argv[]) { string refFileName, notNormalFileName, normalFileName; if (argc < 4) { cout << "usage: normalizeGCContent ref source dest " << endl << " flips the C/Gs in source randomly until they are the same gc content as ref." << endl; exit(1); } refFileName = argv[1]; notNormalFileName = argv[2]; normalFileName = argv[3]; FASTAReader reader; FASTAReader queryReader; FASTASequence ref; vector<FASTASequence> querySequences; int queryTotalLength; reader.Initialize(refFileName); reader.ReadAllSequencesIntoOne(ref); queryReader.Initialize(notNormalFileName); int refCounts[5], queryCounts[5]; int s; refCounts[0] = refCounts[1] =refCounts[2] = refCounts[3] = refCounts[4] = 0; queryCounts[0] = queryCounts[1] =queryCounts[2] = queryCounts[3] = queryCounts[4] = 0; queryReader.ReadAllSequences(querySequences); ofstream normOut; CrucialOpen(normalFileName, normOut); CountNucs(ref, refCounts); float refGC = (1.0*refCounts[TwoBit['c']] + refCounts[TwoBit['g']]) / (refCounts[TwoBit['a']] + refCounts[TwoBit['c']] + refCounts[TwoBit['g']] + refCounts[TwoBit['t']]); int q; for (q = 0; q < querySequences.size(); q++) { CountNucs(querySequences[q], queryCounts); } float queryGC = (1.0*queryCounts[TwoBit['c']] + queryCounts[TwoBit['g']]) / (queryCounts[TwoBit['a']] + queryCounts[TwoBit['c']] + queryCounts[TwoBit['g']] + queryCounts[TwoBit['t']]); float gcToat = 0.0; float atTogc = 0.0; if (refGC > queryGC) { atTogc = (refGC - queryGC); } else { gcToat = (queryGC - refGC); } DNALength queryGenomeLength = queryCounts[0] + queryCounts[1] + queryCounts[2] + queryCounts[3] + queryCounts[4]; DNALength unmaskedQueryLength = queryCounts[0] + queryCounts[1] + queryCounts[2] + queryCounts[3]; DNALength ngc2at = unmaskedQueryLength * gcToat; DNALength nat2gc = unmaskedQueryLength * atTogc; cout << refGC << " " << queryGC << " " << gcToat << " " << atTogc << " " << ngc2at << " " << nat2gc << endl; vector<FASTASequence> normalized; normalized.resize(querySequences.size()); vector<DNALength> cumLengths; cumLengths.resize(normalized.size()+1); cumLengths[0] = 0; for (q = 0; q < querySequences.size(); q++) { normalized[q] = querySequences[q]; cumLengths[q+1] = cumLengths[q] + querySequences[q].length; } DNALength i; for (i = 0; i < ngc2at; i+=2) { DNALength pos, chr; FindRandomNuc(normalized, queryGenomeLength, cumLengths, 'G', chr, pos); normalized[chr].seq[pos] = 'A'; FindRandomNuc(normalized, queryGenomeLength, cumLengths, 'C', chr, pos); normalized[chr].seq[pos] = 'T'; } for (i = 0; i < nat2gc; i+=2) { DNALength pos, chr; FindRandomNuc(normalized, queryGenomeLength, cumLengths, 'A', chr, pos); normalized[chr].seq[pos] = 'g'; FindRandomNuc(normalized, queryGenomeLength, cumLengths, 'T', chr, pos); normalized[chr].seq[pos] = 'c'; } for (q = 0; q < normalized.size(); q++ ){ normalized[q].PrintSeq(normOut); } }
int main(int argc, char* argv[]) { CommandLineParser clp; string refGenomeName; string mutGenomeName; string gffFileName; float insRate = 0; float delRate = 0; float mutRate = 0; bool lower = false; gffFileName = ""; clp.RegisterStringOption("refGenome", &refGenomeName, "Reference genome.", true); clp.RegisterStringOption("mutGenome", &mutGenomeName, "Mutated genome.", true); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterStringOption("gff", &gffFileName, "GFF file describing the modifications made to the genome."); clp.RegisterFloatOption("i", &insRate, "Insertion rate: (0-1].", CommandLineParser::NonNegativeFloat, false); clp.RegisterFloatOption("d", &delRate, "Deletion rate: (0-1]", CommandLineParser::NonNegativeFloat, false); clp.RegisterFloatOption("m", &mutRate, "Mutation rate, even across all nucleotides: (0-1]", CommandLineParser::NonNegativeFloat, false); clp.RegisterFlagOption("lower", &lower, "Make mutations in lower case", false); vector<string> leftovers; clp.ParseCommandLine(argc, argv, leftovers); FASTAReader reader; FASTASequence refGenome; reader.Init(refGenomeName); ofstream mutGenomeOut; CrucialOpen(mutGenomeName, mutGenomeOut, std::ios::out); ofstream gffOut; if (gffFileName != "") { CrucialOpen(gffFileName, gffOut, std::ios::out); } vector<int> insIndices, delIndices, subIndices; int readIndex = 0; InitializeRandomGeneratorWithTime(); while (reader.GetNext(refGenome)) { insIndices.resize(refGenome.length); delIndices.resize(refGenome.length); subIndices.resize(refGenome.length); std::fill(insIndices.begin(), insIndices.end(), false); std::fill(delIndices.begin(), delIndices.end(), false); std::fill(subIndices.begin(), subIndices.end(), 0); enum ChangeType { Ins, Del, Mut, None}; float changeProb[4]; changeProb[Ins] = insRate; changeProb[Del] = changeProb[Ins] + delRate; changeProb[Mut] = changeProb[Del] + mutRate; changeProb[None] = 1; if (changeProb[Mut] > 1) { cout << "ERROR! The sum of the error probabilities must be less than 1" << endl; exit(1); } DNALength pos; float randomNumber; int numIns = 0; int numDel = 0; int numMut = 0; for (pos =0 ; pos < refGenome.length; pos++) { randomNumber = Random(); if (randomNumber < changeProb[Ins]) { insIndices[pos] = true; numIns++; } else if (randomNumber < changeProb[Del]) { delIndices[pos] = true; numDel++; } else if (randomNumber < changeProb[Mut]){ Nucleotide newNuc = TwoBitToAscii[RandomInt(4)]; int maxIts = 100000; int it = 0; while (newNuc == refGenome.seq[pos]) { newNuc = TwoBitToAscii[RandomInt(4)]; if (it == maxIts) { cout << "ERROR, something is wrong with the random number generation, it took too many tries to generate a new nucleotide" << endl; exit(1); } } subIndices[pos] = refGenome[pos]; refGenome.seq[pos] = ToLower(newNuc,lower); ++numMut; } } // cout << readIndex << " m " << numMut << " i " << numIns << " d " << numDel << endl; if (readIndex % 100000 == 0 && readIndex > 0) { cout << readIndex << endl; } // // Now add the insertions and deletions. // FASTASequence newSequence; DNALength newPos; if (numIns - numDel + refGenome.length < 0) { cout << "ERROR, the genome has been deleted to nothing." << endl; exit(1); } ResizeSequence(newSequence, refGenome.length + (numIns - numDel)); newPos = 0; pos = 0; for (pos = 0; pos < refGenome.length; pos++) { assert(newPos < newSequence.length or delIndices[pos] == true); if (subIndices[pos] != 0 and gffFileName != "") { gffOut << refGenome.GetName() << " . SNV " << newPos << " " << newPos <<" 0.00 . . reference=" << (char)subIndices[pos] << ";confidence=10;Name=" << newPos << (char)subIndices[pos] << ">" << refGenome.seq[pos] <<";coverage=10;variantseq=" << refGenome.seq[pos] << endl; } if (insIndices[pos] == true) { newSequence.seq[newPos] = ToLower(TwoBitToAscii[RandomInt(4)], lower); newPos++; newSequence.seq[newPos] = refGenome.seq[pos]; assert(newSequence.seq[newPos] != '1'); assert(newSequence.seq[newPos] != 1); if (gffFileName != "") { gffOut << refGenome.GetName() << " . deletion " << newPos << " " << newPos << " 0.00 . . reference=" << newSequence.seq[newPos] << ";length=1;confidence=10;coverage=0;Name="<< newPos << "del" << newSequence.seq[newPos] << endl; } newPos++; } else if (delIndices[pos] == true) { // no-op, skip if (gffFileName != "") { gffOut << refGenome.GetName() << " . insertion " << newPos << " " << newPos << " 0.00 . . confidence=10;Name=" << newPos << "_ins" << refGenome.seq[pos] << ";reference=.;length=1;coverage=0;variantseq=" << refGenome.seq[newPos] << endl; //ref000001 . deletion 20223 20223 0.00 . . reference=T;length=1;confidence=0;coverage=0;Name=20222delT } } else { newSequence.seq[newPos] = refGenome.seq[pos]; newPos++; } } stringstream titlestrm; titlestrm << " mutated ins " << insRate << " del " << delRate << " mut " << mutRate; newSequence.CopyTitle(refGenome.title); newSequence.AppendToTitle(titlestrm.str()); newSequence.PrintSeq(mutGenomeOut); newSequence.Free(); readIndex++; } }
int main(int argc, char* argv[]) { FASTAReader reader; if (argc < 5) { cout << "usage: wordCounter seqFile tupleSize tupleOutputFile posOutputFile" << endl; exit(1); } string fileName = argv[1]; int tupleSize = atoi(argv[2]); string tupleListName = argv[3]; string posOutName = argv[4]; TupleMetrics tm; tm.Initialize(tupleSize); reader.Init(fileName); FASTASequence seq; reader.GetNext(seq); vector<CountedDNATuple> tupleList; CountedDNATuple tuple; DNALength i; for (i = 0; i < seq.length - tm.tupleSize + 1; i++ ) { if (tuple.FromStringRL((Nucleotide*) (seq.seq + i), tm)) { tuple.count = i; tupleList.push_back(tuple); } } std::sort(tupleList.begin(), tupleList.end()); int t; int t2; int numTuples = tupleList.size(); t = t2 = 0; int numUnique = 0; while (t < numTuples) { t2 = t; t2++; while (t2 < numTuples and tupleList[t] == tupleList[t2]) { t2++; } ++numUnique; t = t2; } ofstream countedTupleListOut; countedTupleListOut.open(tupleListName.c_str(), ios_base::binary); ofstream posOut; posOut.open(posOutName.c_str(), ios_base::binary); countedTupleListOut.write((const char*) &numUnique, sizeof(int)); countedTupleListOut.write((const char*) &tm.tupleSize, sizeof(int)); posOut.write((const char*) &numUnique, sizeof(int)); // // Write out the tuple+counts to a file. // t = t2 = 0; CountedDNATuple countedTuple; int numMultOne = 0; while (t < numTuples) { t2 = t; t2++; while (t2 < numTuples and tupleList[t] == tupleList[t2]) { t2++; } countedTuple.tuple = tupleList[t].tuple; countedTuple.count = t2 - t; if (countedTuple.count == 1) ++numMultOne; countedTupleListOut.write((const char*) &countedTuple,sizeof(CountedDNATuple)); posOut.write((char*)&countedTuple.count, sizeof(int)); int tc; for (tc = t; tc < t2; tc++) { posOut.write((char*) &tupleList[tc].count, sizeof(int)); } t = t2; } // // Write out the positions of the tuples to a file. // posOut.close(); countedTupleListOut.close(); // cout << "found " << numUnique << " distinct " << DNATuple::TupleSize << "-mers." << endl; cout << numMultOne << endl; return 0; }
int main(int argc, char* argv[]) { string cmpFileName; string refFileName; string readsFileName; string mapqvTrackName; if (argc < 2) { cout << " printMapqvTrack: print a gff file of the average mapping quality value" << endl; exit(1); } vector<int> refPositions; cmpFileName = argv[1]; refFileName = argv[2]; mapqvFileName = argv[3]; CmpFile cmpFile; FASTASequence ref; FASTAReader reader; reader.Initialize(refFileName); reader.GetNext(ref); HDFBasReader basReader; SMRTSequence seq, *seqPtr; vector<int> refCoverage; refCoverage.resize(ref.length); std::fill(refCoverage.begin(), refCoverage.end(), 0); /* * These guys pull information from the same pls file. */ HDFCmpReader<CmpAlignment> cmpReader; if (cmpReader.Initialize(cmpFileName) == 0) { cout << "ERROR, could not open the cmp file." << endl; exit(1); } cmpReader.Read(cmpFile); UInt alignmentIndex; // movieIndexSets.resize(nMovies); for (alignmentIndex = 0; alignmentIndex < cmpFile.alnInfo.alignments.size(); alignmentIndex++) { int refSeqId = cmpFile.alnInfo.alignments[alignmentIndex].GetRefSeqId(); int readGroupId = cmpFile.alnInfo.alignments[alignmentIndex].GetReadGroupId(); int refSeqIdIndex; if (cmpFile.refSeqTable.GetIndexOfId(refSeqId, refSeqIdIndex) == false) { // // Sanity check -- we're only looking at alignments to references in the cmp file. // cout << "ERROR, ref seq id: " << refSeqId << " should exist in the cmp file but it does not." << endl; assert(0); } int readGroupIdIndex; cmpFile.readGroupTable.GetIndexOfId(readGroupId, readGroupIdIndex); string readGroupPath = cmpFile.readGroupTable.names[readGroupIdIndex]; string readGroup = cmpReader.readGroupPathToReadGroup[readGroupPath]; int readGroupArrayIndex = cmpReader.refAlignGroups[refSeqIdIndex]->experimentNameToIndex[readGroup]; vector<char> alignedSequence, alignedTarget; // // This read overlaps one of the ref positions. UInt offsetEnd, offsetBegin; offsetEnd = cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetEnd(); offsetBegin = cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetBegin(); vector<unsigned char> byteAlignment; int alignedSequenceLength = offsetEnd - offsetBegin; if (alignedSequenceLength >= 0) { alignedSequence.resize(alignedSequenceLength); alignedTarget.resize(alignedSequenceLength); byteAlignment.resize(alignedSequenceLength); } cmpReader.refAlignGroups[refSeqIdIndex]->readGroups[readGroupArrayIndex]->alignmentArray.Read(offsetBegin, offsetEnd, &byteAlignment[0]); UInt refStart = cmpFile.alnInfo.alignments[alignmentIndex].GetRefStart(); UInt refEnd = cmpFile.alnInfo.alignments[alignmentIndex].GetRefEnd(); UInt readStart= cmpFile.alnInfo.alignments[alignmentIndex].GetQueryStart(); UInt readEnd = cmpFile.alnInfo.alignments[alignmentIndex].GetQueryEnd(); // // Read the alignment string. // if (refSeqIdIndex > 0) continue; // // Convert to something we can compare easily. // alignedSequence[alignedSequence.size()-1]= '\0'; ByteAlignmentToQueryString(&byteAlignment[0], byteAlignment.size(), &alignedSequence[0]); ByteAlignmentToRefString(&byteAlignment[0], byteAlignment.size(), &alignedTarget[0]); int gi, i; gi = 0; int refStrand = cmpFile.alnInfo.alignments[alignmentIndex].GetRCRefStrand(); if (refStrand == 1) { // revcomp the ref strand vector<char> rcAlignedTarget, rcAlignedQuery; int t; rcAlignedTarget.resize(alignedTarget.size()); rcAlignedQuery.resize(alignedSequence.size()); for (t = 0; t < alignedTarget.size(); t++) { if (alignedTarget[t] == ' ') { rcAlignedTarget[alignedTarget.size() - t - 1] = ' '; } else { rcAlignedTarget[alignedTarget.size() - t - 1] = ReverseComplementNuc[alignedTarget[t]]; } if (alignedSequence[t] == ' '){ rcAlignedQuery[alignedTarget.size() - t - 1] = ' '; } else { rcAlignedQuery[alignedTarget.size() - t - 1] = ReverseComplementNuc[alignedTarget[t]]; } } alignedTarget = rcAlignedTarget; alignedSequence = rcAlignedQuery; } int holeNumber = cmpFile.alnInfo.alignments[alignmentIndex].GetHoleNumber(); int ri = readStart; gi = refStart; for (i = 0; i < alignedTarget.size(); i++, gi++, ri++ ) { while(i < alignedTarget.size() and alignedTarget[i] == ' ') { i++; } if (alignedSequence[i] != ' ') { refCoverage[gi]++; } } } // end looping over regions // Now compute the number of gaps. UInt pos; int numNotCovered = 0; for (pos = 0; pos < refCoverage.size(); pos++ ){ if (refCoverage[pos] < 1) { numNotCovered++;} } if (numNotCovered > 100) { cout << "TOO Many!!!" << endl; } else { for (pos = 0; pos < refCoverage.size(); pos++ ){ // cout << refCoverage[pos] << endl; if (refCoverage[pos] < 1) { int left, right; left = right = -1; if (pos > 0) { left = refCoverage[pos-1];} if (pos < refCoverage.size()-1) {right = refCoverage[pos+1];} cout << pos << " " << left << " " << right << endl; } } } }
int main(int argc, char* argv[]) { string rgInName, rgOutName; int minPathLength; string vertexSequenceFileName; if (argc < 5) { cout << "usage: trimShortEnds in.rg vertexSequences minPathLength out.rg" << endl; exit(1); } rgInName = argv[1]; vertexSequenceFileName = argv[2]; minPathLength = atoi(argv[3]); rgOutName = argv[4]; ofstream rgOut; CrucialOpen(rgOutName, rgOut, std::ios::out); FASTAReader vertexSequenceReader; vertexSequenceReader.Init(vertexSequenceFileName); RepeatGraph<string> rg; vector<FASTASequence> vertexSequences; rg.ReadGraph(rgInName); vertexSequenceReader.ReadAllSequences(vertexSequences); VectorIndex vertexIndex; VectorIndex outEdgeIndex; VectorIndex edgeIndex; if (rg.edges.size() == 0) { cout << "LIKELY INVALID GRAPH. There are no edges." << endl; return 0; } // // At first, any edge that exists is connected to a vertex. This // will change as low coverage edges are deleted and replaced by // high coverage edges from the end of the array. // for (edgeIndex = 0; edgeIndex < rg.edges.size(); edgeIndex++) { rg.edges[edgeIndex].connected = true; } set<std::pair<VectorIndex, VectorIndex> > srcDestToRemove; for (vertexIndex = 0; vertexIndex < rg.vertices.size(); vertexIndex++) { if (rg.vertices[vertexIndex].inEdges.size() == 0 and rg.vertices[vertexIndex].outEdges.size() == 1) { // // This is a source. Traverse this until a branching vertex or the end is found. // vector<VectorIndex> path; path.push_back(vertexIndex); int pathLength = 0; VectorIndex pathVertex; VectorIndex pathEdge; pathEdge = rg.vertices[vertexIndex].outEdges[0]; pathVertex = rg.edges[pathEdge].dest; while (rg.vertices[pathVertex].inEdges.size() == 1 and rg.vertices[pathVertex].outEdges.size() == 1) { path.push_back(pathVertex); pathEdge = rg.vertices[pathVertex].outEdges[0]; pathVertex = rg.edges[pathEdge].dest; pathLength += vertexSequences[pathVertex/2].length; } pathLength += vertexSequences[pathVertex/2].length; path.push_back(pathVertex); if (pathLength < minPathLength and path.size() < 3) { // // Remove this path, it is too short. // Also remove the complement. // cout << "trimming path of " << path.size() << " is of sequence length " << pathLength << endl; VectorIndex pathIndex; for (pathIndex = 0; pathIndex < path.size() - 1; pathIndex++) { srcDestToRemove.insert(pair<VectorIndex, VectorIndex>(path[pathIndex], path[pathIndex+1])); srcDestToRemove.insert(pair<VectorIndex, VectorIndex>(2*(path[pathIndex+1]/2) + !(path[pathIndex+1]%2), 2*(path[pathIndex]/2) + !(path[pathIndex]%2))); } } } } MarkEdgePairsForRemoval(srcDestToRemove, rg.vertices, rg.edges); RemoveUnconnectedEdges(rg.vertices, rg.edges); rg.WriteGraph(rgOut); return 0; }
int main(int argc, char* argv[1]) { if (argc < 3) { cout << "Usage: findUnique genome.fasta query.fasta effective_k [options]" << endl; cout << " genome.fasta.sa must exist." << endl; cout << " Finds sequences at least effective_k in length that are unique." << endl; cout << " -max m Allow up to m matches" << endl; cout << " -minLength l Ensure the length of the match is at least this." << endl; cout << " -prefix p n Allow up to n matches across a prefix of length p" << endl; cout << " -suffix s n Allow up to n matches across a suffix of length s" << endl; cout << " Prefix and suffix options override max." << endl; cout << " -out file Print queries to this output file (query.fasta.queries)" << endl; exit(0); } DNASuffixArray sarray; string genomeFileName = argv[1]; string suffixArrayFileName = genomeFileName + ".sa"; FASTAReader reader; FASTASequence genome; int maxN = 0; int prefix = 0; int suffix = 0; int prefixN = 0; int suffixN = 0; int argi = 4; string outputFileName = ""; int minLength = 0; while (argi < argc) { if (strcmp(argv[argi], "-max") == 0) { ++argi; maxN = atoi(argv[argi]); } else if (strcmp(argv[argi], "-prefix") == 0) { ++argi; prefix = atoi(argv[argi]); ++argi; prefixN = atoi(argv[argi]); } else if (strcmp(argv[argi], "-suffix") == 0) { ++argi; suffix = atoi(argv[argi]); ++argi; suffixN = atoi(argv[argi]); } else if (strcmp(argv[argi], "-out") == 0) { ++argi; outputFileName = argv[argi]; } else if (strcmp(argv[argi], "-minLength") == 0) { ++argi; minLength = atoi(argv[argi]); } ++argi; } reader.Initialize(genomeFileName); reader.ReadAllSequencesIntoOne(genome); sarray.Read(suffixArrayFileName); FASTAReader queryReader; FASTASequence querySequence; string queryFileName = argv[2]; int maxLength = atoi(argv[3]); string summaryTableFileName = queryFileName + ".summary"; if (outputFileName == "") { outputFileName = queryFileName + ".queries"; } ofstream summaryTable(summaryTableFileName.c_str()); ofstream outputFile(outputFileName.c_str()); queryReader.Initialize(queryFileName); while (queryReader.GetNext(querySequence)) { int i; cerr << "searching " << querySequence.title << endl; if (querySequence.length < maxLength) { continue; } int nMatches = 0; querySequence.ToUpper(); int localMax; for (i = 0; i < querySequence.length - maxLength + 1; i++) { if ((i + 1) % 100000 == 0) { cerr << "processed: " << i + 1 << endl; } int lcpLength; vector<SAIndex> lcpLeftBounds, lcpRightBounds; vector<SAIndex> rclcpLeftBounds, rclcpRightBounds; localMax = maxN; if (i < prefix) { localMax = prefixN; } if (i >= querySequence.length - suffix) { localMax = suffixN; } if (querySequence.length - i <= maxLength) { continue; } if (querySequence.seq[i] == 'N') { continue; } lcpLength = sarray.StoreLCPBounds(genome.seq, genome.length, // The string which the suffix array is built on. &querySequence.seq[i], querySequence.length-i, true, maxLength, lcpLeftBounds, lcpRightBounds, false); if (lcpLength < minLength) { continue; } if (lcpLength < maxLength or lcpRightBounds.size() == 0 or (lcpRightBounds.size() > 0 and lcpLeftBounds.size() > 0 and lcpRightBounds[lcpRightBounds.size() - 1] - lcpLeftBounds[lcpLeftBounds.size()-1] <= localMax)) { FASTASequence rc; DNASequence subseq; subseq.ReferenceSubstring(querySequence, i, maxLength); subseq.MakeRC(rc); int rclcpLength; int numForwardMatches; if (lcpLength == 0) { numForwardMatches = 0; } else { numForwardMatches = lcpRightBounds[lcpRightBounds.size() - 1] - lcpLeftBounds[lcpLeftBounds.size()-1]; } rclcpLength = sarray.StoreLCPBounds(genome.seq, genome.length, // The string which the suffix array is built on. rc.seq, maxLength, true, rclcpLength, rclcpLeftBounds, rclcpRightBounds, false); string rcstr((const char*)rc.seq, rc.length); if (rclcpLength < maxLength or rclcpRightBounds.size() == 0 or (numForwardMatches + rclcpRightBounds[rclcpRightBounds.size() - 1] - rclcpLeftBounds[rclcpLeftBounds.size()-1] <= localMax)) { char* substr = new char[maxLength+1]; substr[maxLength] = '\0'; memcpy(substr, &querySequence.seq[i], maxLength); // string substr = string((const char*) querySequence.seq, i, maxLength); outputFile << querySequence.title << "\t" << substr << "\t" << i << endl; ++nMatches; delete[] substr; // } } rc.Free(); } } summaryTable << querySequence.title << "\t" << nMatches << endl; querySequence.Free(); } outputFile.close(); genome.Free(); }
int main(int argc, char* argv[]) { string program = "samtoh5"; string versionString = VERSION; AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString); string samFileName, cmpFileName, refFileName; bool parseSmrtTitle = false; bool useShortRefName = false; CommandLineParser clp; string readType = "standard"; int verbosity = 0; clp.SetProgramName(program); clp.SetProgramSummary("Converts in.sam file to out.cmp.h5 file."); clp.SetVersion(versionString); clp.RegisterStringOption("in.sam", &samFileName, "Input SAM file.", true); clp.RegisterStringOption("reference.fasta", &refFileName, "Reference used to generate reads.", true); clp.RegisterStringOption("out.cmp.h5", &cmpFileName, "Output cmp.h5 file.", true); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterFlagOption("smrtTitle", &parseSmrtTitle, "Use this option when converting alignments " "generated from reads produced by the " "pls2fasta from bas.h5 files by parsing read " "coordinates from the SMRT read title. The title " "is in the format /name/hole/coordinates, where " "coordinates are in the format \\d+_\\d+, and " "represent the interval of the read that was " "aligned."); clp.RegisterStringOption("readType", &readType, "Set the read type: 'standard', 'strobe', 'CCS', " "or 'cDNA'"); clp.RegisterIntOption("verbosity", &verbosity, "Set desired verbosity.", CommandLineParser::PositiveInteger); clp.RegisterFlagOption("useShortRefName", &useShortRefName, "Use abbreviated reference names obtained " "from file.sam instead of using full names " "from reference.fasta."); string description = ("Because SAM has optional tags that have different " "meanings in different programs, careful usage is required in order to " "have proper output. The \"xs\" tag in bwa-sw is used to show the " "suboptimal score, but in PacBio SAM (blasr) it is defined as the start " "in the query sequence of the alignment.\nWhen \"-smrtTitle\" is " "specified, the xs tag is ignored, but when it is not specified, the " "coordinates given by the xs and xe tags are used to define the interval " "of a read that is aligned. The CIGAR string is relative to this interval."); clp.SetExamples(description); clp.ParseCommandLine(argc, argv); if (readType != "standard" and readType != "strobe" and readType != "cDNA" and readType != "CCS") { cout << "ERROR. Read type '" << readType << "' must be one of either 'standard', 'strobe', 'cDNA' or 'CCS'." << endl; exit(1); } cerr << "[INFO] " << GetTimestamp() << " [" << program << "] started." << endl; SAMReader<SAMFullReferenceSequence, SAMReadGroup, SAMPosAlignment> samReader; FASTAReader fastaReader; HDFCmpFile<AlignmentCandidate<FASTASequence, FASTASequence> > cmpFile; // // Initialize input/output files. // samReader.Initialize(samFileName); fastaReader.Initialize(refFileName); cmpFile.Create(cmpFileName); // // Configure the file log. // string command; CommandLineParser::CommandLineToString(argc, argv, command); string log = "Convert sam to cmp.h5"; cmpFile.fileLogGroup.AddEntry(command, log, program, GetTimestamp(), versionString); // // Set the readType // cmpFile.SetReadType(readType); // // Read necessary input. // vector<FASTASequence> references; fastaReader.ReadAllSequences(references); // // This should probably be handled by the alignmentSetAdapter, but // time constraints... // AlignmentSet<SAMFullReferenceSequence, SAMReadGroup, SAMPosAlignment> alignmentSet; samReader.ReadHeader(alignmentSet); // // The order of references in vector<FASTASequence> references and // AlignmentSet<, , >alignmentSet.references can be different. // Rearrange alignmentSet.references such that it is ordered in // exactly the same way as vector<FASTASequence> references. // alignmentSet.RearrangeReferences(references); // // Always recompute the MD5 values even if they exist in the input // sam file. Because MD5 is defined differently in sam and cmp.h5 files. // The SAM convention uppercases and normalizes before computing the MD5. // For cmp.h5, we compute the MD5 on the sequence 'as is'. // for(int i = 0; i < alignmentSet.references.size(); i++) { MakeMD5((const char*)&references[i].seq[0], (unsigned int)references[i].length, alignmentSet.references[i].md5); } // // Map short names for references obtained from file.sam to full names obtained from reference.fasta // map<string, string> shortRefNameToFull; map<string, string>::iterator it; assert(references.size() == alignmentSet.references.size()); if (!useShortRefName) { for (int i = 0; i < references.size(); i++) { string shortRefName = alignmentSet.references[i].GetSequenceName(); string fullRefName(references[i].title); if (shortRefNameToFull.find(shortRefName) != shortRefNameToFull.end()) { cout << "ERROR, Found more than one reference " << shortRefName << "in sam header" << endl; exit(1); } shortRefNameToFull[shortRefName] = fullRefName; alignmentSet.references[i].sequenceName = fullRefName; } } // // Start setting up the cmp.h5 file. // AlignmentSetToCmpH5Adapter<HDFCmpFile<AlignmentCandidate<FASTASequence, FASTASequence> > > alignmentSetAdapter; alignmentSetAdapter.Initialize(); alignmentSetAdapter.StoreReferenceInfo(alignmentSet.references, cmpFile); // // Store the alignments. // SAMAlignment samAlignment; int alignIndex = 0; while (samReader.GetNextAlignment(samAlignment)) { if (samAlignment.rName == "*") { continue; } if (!useShortRefName) { //convert shortRefName to fullRefName it = shortRefNameToFull.find(samAlignment.rName); if (it == shortRefNameToFull.end()) { cout << "ERROR, Could not find " << samAlignment.rName << " in the reference repository." << endl; exit(1); } samAlignment.rName = (*it).second; } vector<AlignmentCandidate<> > convertedAlignments; if (verbosity > 0) { cout << "Storing alignment for " << samAlignment.qName << endl; } SAMAlignmentsToCandidates(samAlignment, references, alignmentSetAdapter.refNameToIndex, convertedAlignments, parseSmrtTitle, false); alignmentSetAdapter.StoreAlignmentCandidateList(convertedAlignments, cmpFile, alignIndex); int a; for (a = 0; a < convertedAlignments.size(); a++) { convertedAlignments[a].FreeSubsequences(); } ++alignIndex; /* if (alignIndex == 100) { return 0; }*/ } cerr << "[INFO] " << GetTimestamp() << " [" << program << "] ended." << endl; return 0; }
int main(int argc, char* argv[]) { string gencodeGffFileName, genomeFileName, genesOutFileName; string geneType = "protein_coding"; bool randomSplicing = false; int numRandomSplicing = 1; float pSkip = 0.5; if (argc < 4) { cout << "Usage: extractGenes gencodeGTFFile genomeFile genesOutFileName [-geneType type (protein_coding)] [-randomSplicing] [-numRandomSplicing n] [-pSkip prob (0-1, default:0.5)]" << endl; exit(1); } gencodeGffFileName = argv[1]; genomeFileName = argv[2]; genesOutFileName = argv[3]; int argi = 4; string coordinatesFileName; while (argi < argc) { if (strcmp(argv[argi], "-geneType") == 0) { geneType = argv[++argi]; } else if (strcmp(argv[argi], "-randomSplicing") == 0) { randomSplicing = true; } else if (strcmp(argv[argi], "-numRandomSplicing") == 0) { numRandomSplicing = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-pSkip") == 0) { pSkip = atof(argv[++argi]); } else { cout << "ERROR, bad option " << argv[argi] << endl; exit(1); } ++argi; } coordinatesFileName = genesOutFileName; coordinatesFileName.append(".pos"); FASTAReader reader; reader.Initialize(genomeFileName); ofstream outFile, coordsFile; CrucialOpen(genesOutFileName, outFile, std::ios::out); string coordsFileName = genesOutFileName + ".coords"; CrucialOpen(coordsFileName, coordsFile, std::ios::out); vector<FASTASequence> referenceSequences; reader.ReadAllSequences(referenceSequences); int i; map<string, int> titleToIndex; for (i = 0; i < referenceSequences.size(); i++) { titleToIndex[referenceSequences[i].title] = i; } GencodeGFFFile gencodeFile; gencodeFile.ReadAll(gencodeGffFileName); vector<GencodeGFFGene> genes; IndexGencodeGenes(gencodeFile, genes, geneType); for (i = 0; i < genes.size(); i++) { genes[i].OrderExonsByStart(); } int e; for (i = 0; i < genes.size(); i++) { FASTASequence geneSequence; geneSequence.CopyTitle(genes[i].geneName); if (titleToIndex.find(genes[i].chromosome) == titleToIndex.end()) { continue; } int chrIndex = titleToIndex[genes[i].chromosome]; string sequence = ""; // // Do nothing with 0 length exons. // if (genes[i].exons.size() == 0) { continue; } vector<FASTASequence> geneSequences; vector<GeneCoordinates> geneCoordinates; genes[i].GenerateGeneSequences(referenceSequences[chrIndex], geneSequences, geneCoordinates, randomSplicing); int gi; for (gi = 0; gi < geneSequences.size(); gi++) { if (genes[i].GetStrand() == '+') { geneSequences[gi].PrintSeq(outFile); } else { FASTASequence rc; geneSequences[gi].MakeRC(rc); rc.PrintSeq(outFile); rc.Free(); } coordsFile << geneSequences[gi].title << " " << geneCoordinates[gi].chromosome << " " << geneCoordinates[gi].exonCoordinates.size() << " " << geneCoordinates[gi].strand; int i; for (i = 0; i < geneCoordinates[gi].exonCoordinates.size(); i++) { coordsFile << " " << geneCoordinates[gi].exonCoordinates[i].start << " " << geneCoordinates[gi].exonCoordinates[i].end << " "; } coordsFile << endl; geneSequences[gi].Free(); } // // No need to free the seq, since it is controlled by the string. // } coordsFile.close(); }
int main(int argc, char* argv[]) { string genomeFileName; string suffixArrayFileName; if (argc < 4) { cout << "Usage: printWordCount genome suffixArray k [k2 k3 k4...]" << endl; exit(1); } genomeFileName = argv[1]; suffixArrayFileName = argv[2]; int argi = 3; vector<DNALength> k; while (argi < argc) { k.push_back(atoi(argv[argi])); argi++; } // Get the ref sequence. FASTAReader reader; reader.Init(genomeFileName); FASTASequence seq; // reader.GetNext(seq); reader.ReadAllSequencesIntoOne(seq); seq.ToUpper(); // Get the suffix array. DNASuffixArray sarray; sarray.Read(suffixArrayFileName); int ki; char *word; cout << "wordlen word nword" << endl; for (ki = 0; ki < k.size(); ki++) { word = new char[k[ki]+1]; word[k[ki]] = '\0'; DNALength i; DNALength numUnique = 0; for (i = 0; i < seq.length - k[ki] - 1; ) { DNALength j = i + 1; bool seqAtN = false; int si; for(si = 0; si < k[ki]; si++) { if (seq.seq[sarray.index[i] + si] == 'N') { seqAtN = true; break; } } if (seqAtN) { i++; continue; } while (j < seq.length - k[ki] and seq.length - sarray.index[i] >= k[ki] and seq.length - sarray.index[j] >= k[ki] and strncmp((const char*) &seq.seq[sarray.index[i]], (const char*) &seq.seq[sarray.index[j]], k[ki]) == 0) { j++; } if (seq.length - sarray.index[i] >= k[ki]) { for(si = 0; si < k[ki]; si++) { word[si] = seq.seq[sarray.index[i]+si]; } cout << k[ki] << " " << word << " " << j - i + 1 << endl; if (j == i + 1) { ++numUnique; } } i = j; } } }
int main(int argc, char* argv[]) { string inFileName, readsFileName; DNALength readLength; float coverage = 0; bool noRandInit = false; int numReads = -1; CommandLineParser clp; int qualityValue = 20; bool printFastq = false; int stratify = 0; string titleType = "pacbio"; string fastqType = "illumina"; // or "sanger" clp.RegisterStringOption("inFile", &inFileName, "Reference sequence", 0); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterIntOption("readLength", (int*) &readLength, "The length of reads to simulate. The length is fixed.", CommandLineParser::PositiveInteger, "Length of every read.", 0); clp.RegisterFloatOption("coverage", &coverage, "Total coverage (from which the number of reads is calculated", CommandLineParser::PositiveFloat, 0); clp.RegisterFlagOption("nonRandInit", &noRandInit, "Skip initializing the random number generator with time."); clp.RegisterIntOption("nReads", &numReads, "Total number of reads (from which coverage is calculated)", CommandLineParser::PositiveInteger, 0); clp.RegisterStringOption("readsFile", &readsFileName, "Reads output file", 0); clp.RegisterFlagOption("fastq", &printFastq, "Fake fastq output with constant quality value (20)"); clp.RegisterIntOption("quality", &qualityValue, "Value to use for fastq quality", CommandLineParser::PositiveInteger); clp.RegisterIntOption("stratify", &stratify, "Sample a read every 'stratify' bases, rather than randomly.", CommandLineParser::PositiveInteger); clp.RegisterStringOption("titleType", &titleType, "Set the name of the title: 'pacbio'|'illumina'"); clp.RegisterStringOption("fastqType", &fastqType, "Set the type of fastq: 'illumina'|'sanger'"); vector<string> leftovers; clp.ParseCommandLine(argc, argv, leftovers); if (!noRandInit) { InitializeRandomGeneratorWithTime(); } FASTAReader inReader; inReader.Init(inFileName); vector<FASTASequence> reference; inReader.ReadAllSequences(reference); ofstream readsFile; if (readsFileName == "") { cout << "ERROR. You must specify a reads file." << endl; exit(0); } CrucialOpen(readsFileName, readsFile, std::ios::out); ofstream sangerFastqFile; if (fastqType == "sanger") { string sangerFastqFileName = readsFileName + ".fastq"; CrucialOpen(sangerFastqFileName, sangerFastqFile, std::ios::out); } DNALength refLength = 0; int i; for (i = 0; i < reference.size(); i++) { refLength += reference[i].length; } if (numReads == -1 and coverage == 0 and stratify == 0) { cout << "Error, you must specify either coverage, nReads, or stratify." << endl; exit(1); } else if (numReads == -1) { numReads = (refLength / readLength) * coverage; } if (stratify) { if (!readLength) { cout << "ERROR. If you are using stratification, a read length must be specified." << endl; exit(1); } } DNASequence sampleSeq; sampleSeq.length = readLength; int maxRetry = 10000000; int retryNumber = 0; DNALength seqIndex, seqPos; if (stratify) { seqIndex = 0; seqPos = 0; } DNALength origReadLength = readLength; for (i = 0; stratify or i < numReads; i++) { if (stratify == 0) { FindRandomPos(reference, seqIndex, seqPos, readLength ); } else { // // find the next start pos, or bail if done // if (seqPos >= reference[seqIndex].length) { if (seqIndex == reference.size() - 1) { break; } else { seqIndex = seqIndex + 1; seqPos = 0; continue; } } readLength = min(reference[seqIndex].length - seqPos, origReadLength); } sampleSeq.seq = &reference[seqIndex].seq[seqPos]; int j; int gappedRead = 0; string title; stringstream titleStrm; if (titleType == "pacbio") { titleStrm << i << "|"<< reference[seqIndex].GetName() << "|" << seqPos << "|" << seqPos + readLength; } else if (titleType == "illumina") { titleStrm << "SE_" << i << "_0@" << seqPos << "-"<<seqPos+readLength <<"/1"; } else { cout << "ERROR. Bad title type " << titleType << endl; exit(0); } title = titleStrm.str(); sampleSeq.length = readLength; if (!printFastq) { readsFile << ">" << title << endl; sampleSeq.PrintSeq(readsFile); } else { FASTQSequence fastqSampleSeq; fastqSampleSeq.CopyTitle(title); fastqSampleSeq.seq = sampleSeq.seq; fastqSampleSeq.length = sampleSeq.length; fastqSampleSeq.qual.data = new unsigned char[sampleSeq.length]; fill(fastqSampleSeq.qual.data, fastqSampleSeq.qual.data + sampleSeq.length, qualityValue); if (fastqType == "illumina") { fastqSampleSeq.PrintFastq(readsFile, fastqSampleSeq.length+1); } else { fastqSampleSeq.PrintSeq(readsFile); fastqSampleSeq.PrintQual(sangerFastqFile); } delete[] fastqSampleSeq.qual.data; delete[] fastqSampleSeq.title; } if (stratify) { seqPos += readLength; } } return 0; }
int main(int argc, char* argv[]) { if (argc < 4) { PrintUsage(); exit(0); } string rgFileName, vertexSeqFileName, scaffoldDirName; rgFileName = argv[1]; vertexSeqFileName = argv[2]; scaffoldDirName = argv[3]; string repeatFileName = ""; bool printRepeatsSeparately = false; int argi = 4; bool printSeparate=false; while (argi < argc) { if (strcmp(argv[argi], "-separate") == 0) { printSeparate=true; } else if (strcmp(argv[argi], "-repeats") == 0) { printRepeatsSeparately = true; repeatFileName = argv[++argi]; } else { cout << "bad option: " << argv[argi] << endl; PrintUsage(); exit(1); } ++argi; } FASTAReader vertexSequenceReader; vertexSequenceReader.Init(vertexSeqFileName); // // Input necessary data // vector<FASTASequence> vertexSequences; vertexSequenceReader.ReadAllSequences(vertexSequences); RepeatGraph<string> rg; rg.ReadGraph(rgFileName); vector<FASTASequence> vertexRCSequences; VectorIndex vertexIndex; vertexRCSequences.resize(vertexSequences.size()); for (vertexIndex = 0; vertexIndex < vertexSequences.size(); vertexIndex++ ){ vertexSequences[vertexIndex].MakeRC(vertexRCSequences[vertexIndex]); } VectorIndex outEdgeIndex; int scaffoldIndex = 0; ofstream scaffoldOut; if (printSeparate==false) { // scaffold dir name is really a file name here. CrucialOpen(scaffoldDirName, scaffoldOut, std::ios::out); } for (vertexIndex = 0; vertexIndex < rg.vertices.size(); vertexIndex++ ){ rg.vertices[vertexIndex].traversed = false; } // // Set up flow for calling multiplicity. // /* Test all this out later. AssignMinimumFlowToEdges(rg, 2); AssignVertexFlowBalance(rg); BalanceKirchhoffFlow(rg); UInt edgeIndex; for (edgeIndex = 0; edgeIndex < rg.edges.size(); edgeIndex++) { if (rg.edges[edgeIndex].flow > 1) { cout << edgeIndex << " " << rg.edges[edgeIndex].flow << endl; } } */ int numPrintedVertices = 0; for (vertexIndex = 0; vertexIndex < rg.vertices.size(); vertexIndex++ ){ // // Look to see if this vertex is a branching vertex. // if ((rg.vertices[vertexIndex].inEdges.size() != 1 or rg.vertices[vertexIndex].outEdges.size() != 1) and rg.vertices[vertexIndex].traversed == false) { // // This is a branching vertex. Print all paths from this vertex, but not the vertex // itself if it appears repetitive. // VectorIndex outEdgeIndex; bool printedThisVertex = false; for (outEdgeIndex = 0; outEdgeIndex < rg.vertices[vertexIndex].outEdges.size(); outEdgeIndex++ ){ // // This is a branching vertex. // VectorIndex pathIndex; stringstream scaffoldFileNameStrm; cout << " printing scaffold: " << scaffoldIndex << endl; if (printSeparate) { scaffoldFileNameStrm << scaffoldDirName << "/" << scaffoldIndex << ".fasta"; string scaffoldFileName = scaffoldFileNameStrm.str(); CrucialOpen(scaffoldFileName, scaffoldOut, std::ios::out); } ++scaffoldIndex; // // Store the nonbranching path in a list so that it may be quickly processed. // bool pathIsPrinted = false; vector<VectorIndex> path; if (rg.vertices[vertexIndex].InDegree() == 0 and rg.vertices[vertexIndex].OutDegree() == 1) { path.push_back(vertexIndex); } VectorIndex pathVertex = rg.edges[rg.vertices[vertexIndex].outEdges[outEdgeIndex]].dest; while(rg.vertices[pathVertex].inEdges.size() == 1 and rg.vertices[pathVertex].outEdges.size() == 1) { if (rg.vertices[pathVertex].traversed == true) { pathIsPrinted = true; break; } path.push_back(pathVertex); // Mark the forward and reverse complement as traversed. pathVertex = rg.edges[rg.vertices[pathVertex].outEdges[0]].dest; // } // // Look to see if this is the end of a simple path, if so, add it to the scaffold. // pathVertex = rg.edges[rg.vertices[vertexIndex].outEdges[outEdgeIndex]].dest; if (rg.vertices[pathVertex].OutDegree() == 0 and rg.vertices[pathVertex].InDegree() == 1) { path.push_back(pathVertex); } // // Determine the sequences in the scaffold and the total scaffold length. // if (pathIsPrinted == false) { VectorIndex p; DNALength scaffoldLength = 0; for (p = 0; p < path.size(); p++ ){ scaffoldLength += vertexSequences[path[p]/2].length; rg.vertices[path[p]].traversed = true; // rg.vertices[2*(path[p]/2)+ !(path[p]%2)].traversed = true; ++numPrintedVertices; } cout << "path is of size " << path.size() << " length " << scaffoldLength << endl; if (!printSeparate) { scaffoldOut << ">" << scaffoldIndex << " " << path.size() << " " << scaffoldLength << endl; } for (p = 0; p < path.size(); p++) { if (printSeparate) { scaffoldOut << ">" << p << " " << path[p]/2 << " " << vertexSequences[path[p]/2].length << endl; } if (path[p]%2 == 0) { ((DNASequence)vertexSequences[path[p]/2]).PrintSeq(scaffoldOut); } else { ((DNASequence)vertexRCSequences[path[p]/2]).PrintSeq(scaffoldOut); } rg.vertices[path[p]].traversed = true; rg.vertices[2*(path[p]/2) + !(path[p]%2)].traversed = true; } if (printSeparate) { scaffoldOut.close(); scaffoldOut.clear(); } } } } } ofstream* outPtr; ofstream repeatOut; if (printRepeatsSeparately) { CrucialOpen(repeatFileName, repeatOut, std::ios::out); outPtr = &repeatOut; } else { outPtr = &scaffoldOut; } for (vertexIndex = 0; vertexIndex < rg.vertices.size(); vertexIndex++ ){ if (rg.vertices[vertexIndex].traversed == false) { // // Print this vertex sequence only. It is repetitive, or isolated. // *outPtr << ">" << scaffoldIndex << endl; ++scaffoldIndex; if (vertexIndex%2 == 0) { ((DNASequence)vertexSequences[vertexIndex/2]).PrintSeq(*outPtr); } else { ((DNASequence)vertexRCSequences[vertexIndex/2]).PrintSeq(*outPtr); } rg.vertices[vertexIndex].traversed = true; rg.vertices[2*(vertexIndex/2)+ !(vertexIndex%2)].traversed = true; } } cout << "printed: " << numPrintedVertices << " of " << rg.vertices.size() << endl; }
int main(int argc, char* argv[]) { string ad1File, ad2File, readsFile, readsOutFile; FASTAReader ad1Reader; FASTAReader ad2Reader; FASTAReader reader; CommandLineParser cl; float minPctSimilarity = 0.60; int indel = 3; int minLength = 10; cl.RegisterStringOption("ad1", &ad1File, "FASTA file with the first adapter"); cl.RegisterStringOption("ad2", &ad2File, "FASTA file with the second adapter"); cl.RegisterStringOption("reads", &readsFile, "FASTA file with SMRTBell reads"); cl.RegisterStringOption("readsout", &readsOutFile, "output file for split reads"); cl.RegisterPreviousFlagsAsHidden(); cl.RegisterFloatOption("pctSim", &minPctSimilarity, "Minimum percent similarity to trigger a match to an adapter.", CommandLineParser::PositiveFloat); cl.RegisterIntOption("indel", &indel, "Penalty for indel (positive)", CommandLineParser::NonNegativeInteger); cl.RegisterIntOption("minLength", &minLength, "Minimum length pass to retain.", CommandLineParser::PositiveInteger); vector<string> opts; cl.ParseCommandLine(argc, argv, opts); /* * Open all the required files, quitting if they are unavailable. */ ad1Reader.Init(ad1File); ad2Reader.Init(ad2File); reader.Init(readsFile); ofstream splitOut; CrucialOpen(readsOutFile, splitOut); FASTASequence ad1, ad2; ad1Reader.GetNext(ad1); ad2Reader.GetNext(ad2); FASTASequence read; vector<int> scoreMat; vector<Arrow> pathMat; int readIndex = 0; while(reader.GetNext(read)) { read.ToUpper(); // // Do a fitting sequence alignment to match one of the two // adapters into the read. // vector<int> passStarts, passLengths, la; read.PrintSeq(cout); SplitRead(read, 0, read.length, ad1, ad2, indel, passStarts, passLengths,la, 0, scoreMat, pathMat, minPctSimilarity, minLength); int i; for (i = 0; i < passStarts.size(); i++) { cout << "read: " << readIndex << " pass: "******" " << passStarts[i] << " " << passLengths[i] << " " << la[i] << endl; } ++readIndex; } }
int main(int argc, char* argv[]) { FASTAReader reader; FASTASequence read; int maxLength = 100; if (argc < 3) { cout << "usage: pairAlignAllContigs inFile maxLength equivalencies [-minIdent i]" << endl; exit(0); } string readsFileName, equivalenciesFileName; readsFileName = argv[1]; maxLength = atoi(argv[2]); equivalenciesFileName = argv[3]; int argi = 4; float minIdentity = 80; while (argi < argc) { if (strcmp(argv[argi], "-minIdent") == 0) { minIdentity = atoi(argv[++argi]); } ++argi; } vector<FASTASequence> reads, readsRC;; reader.Init(readsFileName); reader.ReadAllSequences(reads); readsRC.resize(reads.size()); int r; for (r =0; r < reads.size();r++) { reads[r].MakeRC(readsRC[r]); } ofstream equivOut; CrucialOpen(equivalenciesFileName, equivOut); Matrix<int> alignScores; Matrix<float> alignIdentities; alignScores.Resize(reads.size(), reads.size()); alignIdentities.Resize(reads.size(), reads.size()); vector<int> scoreMat; vector<Arrow> pathMat; int i, j; int alignScore; FASTASequence readi, readj; FASTASequence rcReadi, rcReadj; for (i = 0; i < reads.size(); i++) { float maxFrontIdent, maxEndIdent; int maxFrontIdentIndex, maxEndIdentIndex; maxFrontIdent = 0; maxEndIdent = 0; maxFrontIdentIndex = 0; maxEndIdentIndex = 0; int maxFrontIdentLength = 0; int maxEndIdentLength = 0; int maxFrontLength = 0; int maxEndLength = 0; int nmaxFrontLengthIndex = 0; int maxEndLengthIndex = 0; float maxFrontLengthIdent = 0; float maxEndLengthIdent = 0; int maxFrontLengthIndex = 0; equivOut << reads[i].GetName(); for (j = 0; j < reads.size(); j++ ){ // // Store the two ends of the alignment. // alignScore = 0; int rcAlignScore; Alignment alignment; Alignment rcAlignment; Alignment *optAlignment; if (i != j) { if (reads[i].length < maxLength and reads[j].length < maxLength) { alignScore = SWAlign(reads[i], reads[j], SMRTDistanceMatrix, 3, scoreMat, pathMat, alignment, Global); } if (reads[i].length < maxLength and reads[j].length < maxLength) { rcAlignScore = SWAlign(reads[i], readsRC[j], SMRTDistanceMatrix, 3, scoreMat, pathMat, rcAlignment, Global); } ComputeAlignmentStats(alignment, reads[i].seq, reads[j].seq, SMRTDistanceMatrix, 3,3 ); ComputeAlignmentStats(rcAlignment, reads[i].seq, readsRC[j].seq, SMRTDistanceMatrix, 3,3 ); if (alignment.pctSimilarity > minIdentity or rcAlignment.pctSimilarity > minIdentity) { equivOut << " " << reads[j].GetName(); } } } equivOut << endl; } return 0; }
int main(int argc, char* argv[]) { CommandLineParser clp; string readsFileName; string alignmentsFileName; string outputFileName; float minMergeIdentity = 0.70; clp.RegisterStringOption("reads", &readsFileName, "Reads used for alignments."); clp.RegisterStringOption("alignments", &alignmentsFileName, "SAM formatted alignments."); clp.RegisterIntOption("k", &vertexSize, "Minimum match length", CommandLineParser::PositiveInteger); clp.RegisterStringOption("outfile", &outputFileName, "Alignment output."); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterFlagOption("v", &verbose, ""); clp.RegisterFloatOption("minMergeIdentity", &minMergeIdentity, "Minimum identity to merge paths.", CommandLineParser::PositiveFloat); clp.ParseCommandLine(argc, argv); if (minMergeIdentity < 0 or minMergeIdentity > 1) { cout << "ERROR. minMergeIdentity must be between 0 and 1" << endl; exit(1); } vector<FASTASequence> reads; FASTAReader fastaReader; fastaReader.Initialize(readsFileName); fastaReader.ReadAllSequences(reads); // // It is necessary to go from read title to index in the list of reads. // map<string, int> readNameToIndex; BuildReadNameToIndexMap(reads, readNameToIndex); ReadWordMatchVector readWordMatches; InitializeFromReads(reads, readWordMatches); // // Get ready to read in the alignments. // SAMReader<SAMFullReferenceSequence, SAMReadGroup, SAMPosAlignment> samReader; samReader.Initialize(alignmentsFileName); AlignmentSet<SAMFullReferenceSequence, SAMReadGroup, SAMPosAlignment> alignmentSet; samReader.ReadHeader(alignmentSet); SAMAlignment samAlignment; AlignmentCandidate<> alignment; int numAlignedBases = 0; int alignmentIndex = 0; while ( samReader.GetNextAlignment( samAlignment ) ) { vector<AlignmentCandidate<> > alignments; SAMAlignmentsToCandidates(samAlignment, reads, readNameToIndex, alignments, false, true); int i; ++alignmentIndex; int a; for (a = 0; a < alignments.size();a++) { if (alignments[a].qName != alignments[a].tName) { MarkMatches(alignments[a], readNameToIndex, vertexSize, readWordMatches); } } if (alignmentIndex % 1000 == 0) { cout << alignmentIndex << endl; } } int numMatches = 0; int parentIndex = 1; int r; for (r = 0; r < readWordMatches.size(); r++) { readWordMatches[r].CreateParents(); numMatches += readWordMatches[r].pos.size(); } vector<int> parentIndices; parentIndices.resize(2*numMatches + 1); fill(parentIndices.begin(), parentIndices.end(), 0); // // Start indexing off at 1 so that 0 does not need to be treated in // a special case. // int curParentIndex = 1; cout << "There are " << numMatches << " matches." << endl; samReader.Close(); samReader.Initialize(alignmentsFileName); AlignmentSet<SAMFullReferenceSequence, SAMReadGroup, SAMPosAlignment> alignmentSet2; samReader.ReadHeader(alignmentSet2); numAlignedBases = 0; alignmentIndex = 0; while ( samReader.GetNextAlignment( samAlignment ) ) { vector<AlignmentCandidate<> > alignments; SAMAlignmentsToCandidates(samAlignment, reads, readNameToIndex, alignments, false, true); int i; ++alignmentIndex; int a; for (a = 0; a < alignments.size();a++) { if (alignments[a].qName != alignments[a].tName) { JoinVertices(alignments[a], vertexSize, readNameToIndex, readWordMatches, curParentIndex, parentIndices); } } if (alignmentIndex % 1000 == 0) { cout << alignmentIndex << endl; } } vector<int> parentCounts; parentCounts.resize(parentIndices.size()); fill(parentCounts.begin(), parentCounts.end(), 0); int p; PromoteAll(parentIndices); int i; for (r = 0; r < readWordMatches.size(); r++) { for (i = 0; i < readWordMatches[r].parents.size(); i++) { readWordMatches[r].parents[i] = parentIndices[readWordMatches[r].parents[i]]; parentCounts[readWordMatches[r].parents[i]]++; } } /* for (i = 0; i < readWordMatches.size(); i++) { readWordMatches[i].PrintPos(cout); readWordMatches[i].PrintParents(cout); } */ map<int,int> hist; int numParents = 0; for (i = 1; i < parentCounts.size() && parentIndices[i] != 0; i++) { if (parentCounts[i] != 0) { ++numParents; } if (hist.find(parentCounts[i]) == hist.end()) { hist[parentCounts[i]] = 1; } else { hist[parentCounts[i]]++; } } map<int,int>::iterator histIt; cout << " freq count" << endl; for(histIt = hist.begin(); histIt != hist.end(); ++histIt) { cout << (*histIt).second << " " << (*histIt).first << endl; } MatchVertexList vertices; vertices.resize(numParents); cout << "there are " << numParents << " parents. " << endl; }
int main(int argc, char* argv[]) { if (argc < 2) { PrintUsage(); exit(1); } int argi = 1; string saFile = argv[argi++]; vector<string> inFiles; int doBLT = 1; int bltPrefixLength = 8; int parsingOptions = 0; SAType saBuildType = larsson; int read4BitCompressed = 0; int diffCoverSize = 0; while (argi < argc) { if (strlen(argv[argi]) > 0 and argv[argi][0] == '-'){ parsingOptions = 1; } if (!parsingOptions) { inFiles.push_back(argv[argi]); } else { if (strcmp(argv[argi], "-blt") == 0) { doBLT = 1; if (argi < argc - 1) { bltPrefixLength = atoi(argv[++argi]); if (bltPrefixLength == 0) { cout << argv[argi] << " is not a valid lookup table length." << endl; exit(1); } } else { cout << "Please specify a lookup table length." << endl; exit(1); } } else if (strcmp(argv[argi], "-mamy") == 0) { saBuildType = manmy; } else if (strcmp(argv[argi], "-larsson") == 0) { saBuildType = larsson; } else if (strcmp(argv[argi], "-mcilroy") == 0) { saBuildType = mcilroy; } else if (strcmp(argv[argi], "-slow") == 0) { saBuildType = slow; } else if (strcmp(argv[argi], "-kark") == 0) { saBuildType = kark; } else if (strcmp(argv[argi], "-mafe") == 0) { saBuildType = mafe; } else if (strcmp(argv[argi], "-welter") == 0) { saBuildType = welter; } else if (strcmp(argv[argi], "-welterweight") == 0) { if (argi < argc-1) { diffCoverSize = atoi(argv[++argi]); } else { cout << "Please specify a difference cover size. Valid values are 7,32,64,111, and 2281. Larger values use less memory but may be slower." << endl; exit(1); } if ( ! (diffCoverSize == 7 or diffCoverSize == 32 or diffCoverSize == 64 or diffCoverSize == 111 or diffCoverSize == 2281) ) { cout << "The difference cover size must be one of 7,32,64,111, or 2281." << endl; cout << "Larger numbers use less space but are more slow." << endl; exit(1); } } else if (strcmp(argv[argi], "-4bit") == 0) { read4BitCompressed = 1; } else { PrintUsage(); cout << "ERROR, bad option: " << argv[argi] << endl; exit(1); } } ++argi; } if (inFiles.size() == 0) { // // Special use case: the input file is a fasta file. Write to that file + .sa // inFiles.push_back(saFile); saFile = saFile + ".sa"; } VectorIndex inFileIndex; FASTASequence seq; CompressedSequence<FASTASequence> compSeq; if (read4BitCompressed == 0) { for (inFileIndex = 0; inFileIndex < inFiles.size(); ++inFileIndex) { FASTAReader reader; reader.Init(inFiles[inFileIndex]); reader.SetSpacePadding(111); if (saBuildType == kark) { // // The Karkkainen sa building method requires a little extra // space at the end of the dna sequence so that counting may // be done mod 3 without adding extra logic for boundaries. // } if (inFileIndex == 0) { reader.ReadAllSequencesIntoOne(seq); reader.Close(); } else { while(reader.ConcatenateNext(seq)) { cout << "added " << seq.title << endl; } } } seq.ToThreeBit(); //seq.ToUpper(); } else { assert(inFiles.size() == 1); cout << "reading compressed sequence." << endl; compSeq.Read(inFiles[0]); seq.seq = compSeq.seq; seq.length = compSeq.length; compSeq.RemoveCompressionCounts(); cout << "done." << endl; } // // For now, do not allow creation of suffix arrays on sequences > 4G. // if (seq.length >= UINT_MAX) { cout << "ERROR, references greater than " << UINT_MAX << " bases are not supported." << endl; cout << "Consider breaking the reference into multiple files, running alignment. " << endl; cout << "against each file, and merging the result." << endl; exit(1); } vector<int> alphabet; SuffixArray<Nucleotide, vector<int> > sa; // sa.InitTwoBitDNAAlphabet(alphabet); // sa.InitAsciiCharDNAAlphabet(alphabet); sa.InitThreeBitDNAAlphabet(alphabet); if (saBuildType == manmy) { sa.MMBuildSuffixArray(seq.seq, seq.length, alphabet); } else if (saBuildType == mcilroy) { sa.index = new SAIndex[seq.length+1]; DNALength i; for (i = 0; i < seq.length; i++) { sa.index[i] = seq.seq[i] + 1;} sa.index[seq.length] = 0; ssort(sa.index, NULL); for (i = 1; i < seq.length+1; i++ ){ sa.index[i-1] = sa.index[i];}; sa.length = seq.length; } else if (saBuildType == larsson) { sa.LarssonBuildSuffixArray(seq.seq, seq.length, alphabet); } else if (saBuildType == kark) { sa.index = new SAIndex[seq.length]; seq.ToThreeBit(); DNALength p; for (p = 0; p < seq.length; p++ ){ seq.seq[p]++; } KarkkainenBuildSuffixArray<Nucleotide>(seq.seq, sa.index, seq.length, 5); sa.length = seq.length; } else if (saBuildType == mafe) { // sa.MaFeBuildSuffixArray(seq.seq, seq.length); } else if (saBuildType == welter) { if (diffCoverSize == 0) { sa.LightweightBuildSuffixArray(seq.seq, seq.length); } else { sa.LightweightBuildSuffixArray(seq.seq, seq.length, diffCoverSize); } } if (doBLT) { sa.BuildLookupTable(seq.seq, seq.length, bltPrefixLength); } sa.Write(saFile); return 0; }
int main(int argc, char* argv[]) { #ifdef USE_GOOGLE_PROFILER char *profileFileName = getenv("CPUPROFILE"); if (profileFileName != NULL) { ProfilerStart(profileFileName); } else { ProfilerStart("google_profile.txt"); } #endif // Register inputs and outputs. string samFileName, refFileName, outFileName; CommandLineParser clp; clp.RegisterStringOption("file.sam", &samFileName, "Input SAM file."); clp.RegisterStringOption("reference.fasta", &refFileName, "Reference used to generate reads."); clp.RegisterStringOption("out.sam", &outFileName, "Output SAM file."); clp.RegisterPreviousFlagsAsHidden(); // Register filter criteria options. int minAlnLength = 50; float minPctSimilarity = 70, minPctAccuracy = 70; string hitPolicyStr = "randombest"; bool useScoreCutoff = false; int scoreCutoff = INF_INT; int scoreSignInt = -1; RegisterFilterOptions(clp, minAlnLength, minPctSimilarity, minPctAccuracy, hitPolicyStr, useScoreCutoff, scoreSignInt, scoreCutoff); int seed = 1; clp.RegisterIntOption("seed", &seed, "(1) Seed for random number generator.\n" "If seed is 0, then use current time as seed.", CommandLineParser::Integer); string holeNumberStr; Ranges holeNumberRanges; clp.RegisterStringOption("holeNumbers", &holeNumberStr, "A string of comma-delimited hole number ranges to output hits, " "such as '1,2,10-12'. " "This requires hit titles to be in SMRT read title format."); bool parseSmrtTitle = false; clp.RegisterFlagOption("smrtTitle", &parseSmrtTitle, "Use this option when filtering alignments generated by " "programs other than blasr, e.g. bwa-sw or gmap. " " Parse read coordinates from the SMRT read title. " "The title is in the format /name/hole/coordinates, where" " coordinates are in the format \\d+_\\d+, and represent " "the interval of the read that was aligned."); /* This experimental option can be useful for metagenomics, in which case * there are hundreds of sequences in the target, of which many titles are * long and may contain white spaces (e.g., ' ', '\t'). * In order to save disc space and avoid the (possibly) none unique mapping * between full and short reference names, one may call blasr with * -titleTable option to represent all target sequences in the output * by their indices in the title table.*/ string titleTableName = ""; clp.RegisterStringOption("titleTable", &titleTableName, "Use this experimental option when filtering alignments generated by " "blasr with -titleTable titleTableName, in which case " "reference titles in SAM are represented by their " "indices (e.g., 0, 1, 2, ...) in the title table."); string adapterGffFileName = ""; clp.RegisterStringOption("filterAdapterOnly", &adapterGffFileName, "Use this option to remove reads which can only map to adapters " "specified in the GFF file."); bool verbose = false; clp.RegisterFlagOption("v", &verbose, "Be verbose."); clp.SetExamples( "Because SAM has optional tags that have different meanings" " in different programs, careful usage is required in order " "to have proper output. The \"xs\" tag in bwa-sw is used to " "show the suboptimal score, but in PacBio SAM (blasr) it is " "defined as the start in the query sequence of the alignment.\n" "When \"-smrtTitle\" is specified, the xs tag is ignored, but " "when it is not specified, the coordinates given by the xs and " "xe tags are used to define the interval of a read that is " "aligned. The CIGAR string is relative to this interval."); clp.ParseCommandLine(argc, argv); // Set random number seed. if (seed == 0) { srand(time(NULL)); } else { srand(seed); } scoreSign = (scoreSignInt == -1)?ScoreSign::NEGATIVE:ScoreSign::POSITIVE; Score s(static_cast<float>(scoreCutoff), scoreSign); FilterCriteria filterCriteria(minAlnLength, minPctSimilarity, minPctAccuracy, true, s); filterCriteria.Verbose(verbose); HitPolicy hitPolicy(hitPolicyStr, scoreSign); string errMsg; if (not filterCriteria.MakeSane(errMsg)) { cout << errMsg << endl; exit(1); } // Parse hole number ranges. if (holeNumberStr.size() != 0) { if (not holeNumberRanges.setRanges(holeNumberStr)) { cout << "Could not parse hole number ranges: " << holeNumberStr << "." << endl; exit(1); } } // Open output file. ostream * outFilePtr = &cout; ofstream outFileStrm; if (outFileName != "") { CrucialOpen(outFileName, outFileStrm, std::ios::out); outFilePtr = &outFileStrm; } GFFFile adapterGffFile; if (adapterGffFileName != "") adapterGffFile.ReadAll(adapterGffFileName); SAMReader<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> samReader; FASTAReader fastaReader; // // Initialize samReader and fastaReader. // samReader.Initialize(samFileName); fastaReader.Initialize(refFileName); // // Configure the file log. // string command; CommandLineParser::CommandLineToString(argc, argv, command); string log = "Filter sam hits."; string program = "samFilter"; string versionString = VERSION; AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString); // // Read necessary input. // vector<FASTASequence> references; fastaReader.ReadAllSequences(references); // If the SAM file is generated by blasr with -titleTable, // then references in the SAM are represented by // their corresponding indices in the title table. // In that case, we need to convert reference titles in fasta file // to their corresponding indices in the title table, such that // references in both SAM and fasta files are represented // by title table indices and therefore can match. if (titleTableName != "") { ConvertTitlesToTitleTableIndices(references, titleTableName); } AlignmentSet<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> alignmentSet; vector<string> allHeaders = samReader.ReadHeader(alignmentSet); // Process SAM Header. string commandLineString; clp.CommandLineToString(argc, argv, commandLineString); allHeaders.push_back("@PG\tID:SAMFILTER\tVN:" + versionString + \ "\tCL:" + program + " " + commandLineString); for (int i = 0; i < allHeaders.size(); i++) { outFileStrm << allHeaders[i] << endl; } // // The order of references in vector<FASTASequence> references and // AlignmentSet<, , >alignmentSet.references can be different. // Rearrange alignmentSet.references such that they are ordered in // exactly the same way as vector<FASTASequence> references. // alignmentSet.RearrangeReferences(references); // Map reference name obtained from SAM file to indices map<string, int> refNameToIndex; for (int i = 0; i < references.size(); i++) { string refName = alignmentSet.references[i].GetSequenceName(); refNameToIndex[refName] = i; } // // Store the alignments. // SAMAlignment samAlignment; int alignIndex = 0; // // For 150K, each chip produces about 300M sequences // (not including quality values and etc.). // Let's assume that the sam file and reference data can // fit in the memory. // Need to scale for larger sequal data in the future. // vector<SAMAlignment> allSAMAlignments; while (samReader.GetNextAlignment(samAlignment)) { if (samAlignment.rName == "*") { continue; } if (parseSmrtTitle and holeNumberStr.size() != 0) { string movieName; int thisHoleNumber; if (not ParsePBIReadName(samAlignment.qName, movieName, thisHoleNumber)) { cout << "ERROR, could not parse SMRT title: " << samAlignment.qName << "." << endl; exit(1); } if (not holeNumberRanges.contains(UInt(thisHoleNumber))) { if (verbose) cout << thisHoleNumber << " is not in range." << endl; continue; } } if (samAlignment.cigar.find('P') != string::npos) { cout << "WARNING. Could not process SAM record with 'P' in " << "its cigar string." << endl; continue; } vector<AlignmentCandidate<> > convertedAlignments; SAMAlignmentsToCandidates(samAlignment, references, refNameToIndex, convertedAlignments, parseSmrtTitle, false); if (convertedAlignments.size() > 1) { cout << "WARNING. Ignore multiple segments." << endl; continue; } for (int i = 0; i < 1; i++) { AlignmentCandidate<> & alignment = convertedAlignments[i]; //score func does not matter DistanceMatrixScoreFunction<DNASequence, DNASequence> distFunc; ComputeAlignmentStats(alignment, alignment.qAlignedSeq.seq, alignment.tAlignedSeq.seq, distFunc); // Check whether this alignment can only map to adapters in // the adapter GFF file. if (adapterGffFileName != "" and CheckAdapterOnly(adapterGffFile, alignment, refNameToIndex)) { if (verbose) cout << alignment.qName << " filter adapter only." << endl; continue; } // Assign score to samAlignment. samAlignment.score = samAlignment.as; if (not filterCriteria.Satisfy(static_cast<AlignmentCandidate<> *>(&alignment))) { continue; } allSAMAlignments.push_back( samAlignment ); alignment.FreeSubsequences(); } ++alignIndex; } // Sort all SAM alignments by qName, score and target position. sort(allSAMAlignments.begin(), allSAMAlignments.end(), byQNameScoreTStart); unsigned int groupBegin = 0; unsigned int groupEnd = -1; vector<SAMAlignment> filteredSAMAlignments; while(groupBegin < allSAMAlignments.size()) { // Get the next group of SAM alignments which have the same qName // from allSAMAlignments[groupBegin ... groupEnd) GetNextSAMAlignmentGroup(allSAMAlignments, groupBegin, groupEnd); vector<unsigned int> hitIndices = ApplyHitPolicy( hitPolicy, allSAMAlignments, groupBegin, groupEnd); for(unsigned int i = 0; i < hitIndices.size(); i++) { filteredSAMAlignments.push_back(allSAMAlignments[hitIndices[i]]); } groupBegin = groupEnd; } // Sort all SAM alignments by reference name and query name sort(filteredSAMAlignments.begin(), filteredSAMAlignments.end(), byRNameQName); for(unsigned int i = 0; i < filteredSAMAlignments.size(); i++) { filteredSAMAlignments[i].PrintSAMAlignment(outFileStrm); } if (outFileName != "") { outFileStrm.close(); } #ifdef USE_GOOGLE_PROFILER ProfilerStop(); #endif return 0; }
int main(int argc, char* argv[]) { string program = "samtom4"; string versionString = VERSION; AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString); string samFileName, refFileName, outFileName; bool printHeader = false; bool parseSmrtTitle = false; bool useShortRefName = false; CommandLineParser clp; clp.SetProgramName(program); clp.SetVersion(versionString); clp.SetProgramSummary("Converts a SAM file generated by blasr to M4 format."); clp.RegisterStringOption("in.sam", &samFileName, "Input SAM file, which is produced by blasr."); clp.RegisterStringOption("reference.fasta", &refFileName, "Reference used to generate file.sam."); clp.RegisterStringOption("out.m4", &outFileName, "Output in blasr M4 format."); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterFlagOption("header", &printHeader, "Print M4 header."); clp.RegisterFlagOption("useShortRefName", &useShortRefName, "Use abbreviated reference names obtained " "from file.sam instead of using full names " "from reference.fasta."); //clp.SetExamples(program + " file.sam reference.fasta out.m4"); clp.ParseCommandLine(argc, argv); ostream * outFilePtr = &cout; ofstream outFileStrm; if (outFileName != "") { CrucialOpen(outFileName, outFileStrm, std::ios::out); outFilePtr = &outFileStrm; } SAMReader<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> samReader; FASTAReader fastaReader; // // Initialize samReader and fastaReader. // samReader.Initialize(samFileName); fastaReader.Initialize(refFileName); // // Configure the file log. // string command; CommandLineParser::CommandLineToString(argc, argv, command); // // Read necessary input. // vector<FASTASequence> references; fastaReader.ReadAllSequences(references); AlignmentSet<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> alignmentSet; samReader.ReadHeader(alignmentSet); // // The order of references in vector<FASTASequence> references and // AlignmentSet<, , >alignmentSet.references can be different. // Rearrange alignmentSet.references such that it is ordered in // exactly the same way as vector<FASTASequence> references. // alignmentSet.RearrangeReferences(references); // // Map short names for references obtained from file.sam to // full names obtained from reference.fasta // map<string, string> shortRefNameToFull; map<string, string>::iterator it; assert(references.size() == alignmentSet.references.size()); if (!useShortRefName) { for (size_t i = 0; i < references.size(); i++) { string shortRefName = alignmentSet.references[i].GetSequenceName(); string fullRefName(references[i].title); if (shortRefNameToFull.find(shortRefName) != shortRefNameToFull.end()) { cout << "ERROR, Found more than one reference " << shortRefName << "in sam header" << endl; exit(1); } shortRefNameToFull[shortRefName] = fullRefName; alignmentSet.references[i].sequenceName = fullRefName; } } // Map reference name obtained from SAM file to indices map<string, int> refNameToIndex; for (size_t i = 0; i < references.size(); i++) { string refName = alignmentSet.references[i].GetSequenceName(); refNameToIndex[refName] = i; } // // Store the alignments. // SAMAlignment samAlignment; size_t alignIndex = 0; // // For 150K, each chip produces about 300M sequences // (not including quality values and etc.). // Let's assume that the sam file and reference data can // fit in the memory. // Need to scale for larger sequal data in the future. // if (printHeader) IntervalOutput::PrintHeader(*outFilePtr); // The socre matrix does not matter because we will use the // aligner's score from SAM file anyway. DistanceMatrixScoreFunction<DNASequence, DNASequence> distScoreFn; while (samReader.GetNextAlignment(samAlignment)) { if (samAlignment.rName == "*") { continue; } if (!useShortRefName) { //convert shortRefName to fullRefName it = shortRefNameToFull.find(samAlignment.rName); if (it == shortRefNameToFull.end()) { cout << "ERROR, Could not find " << samAlignment.rName << " in the reference repository." << endl; exit(1); } samAlignment.rName = (*it).second; } // The padding character 'P' is not supported if (samAlignment.cigar.find('P') != string::npos) { cout << "WARNING. Could not process sam record with 'P' in its cigar string." << endl; continue; } vector<AlignmentCandidate<> > convertedAlignments; // // Keep reference as forward. // So if IsReverseComplement(sam.flag)==true, then qStrand is reverse // and tStrand is forward. // bool keepRefAsForward = false; SAMAlignmentsToCandidates(samAlignment, references, refNameToIndex, convertedAlignments, parseSmrtTitle, keepRefAsForward); if (convertedAlignments.size() > 1) { cout << "WARNING. Ignore an alignment which has multiple segments." << endl; continue; } //all alignments are unique single-ended alignments. for (int i = 0; i < 1; i++) { AlignmentCandidate<> & alignment = convertedAlignments[i]; ComputeAlignmentStats(alignment, alignment.qAlignedSeq.seq, alignment.tAlignedSeq.seq, distScoreFn); // Use aligner's score from SAM file anyway. alignment.score = samAlignment.as; alignment.mapQV = samAlignment.mapQV; // Since SAM only has the aligned sequence, many info of the // original query (e.g. the full length) is missing. // Overwrite alignment.qLength (which is length of the query // in the SAM alignment) with xq (which is the length of the // original query sequence saved by blasr) right before printing // the output so that one can reconstruct a blasr m4 record from // a blasr sam alignment. if (samAlignment.xq!=0) alignment.qLength = samAlignment.xq; IntervalOutput::PrintFromSAM(alignment, *outFilePtr); alignment.FreeSubsequences(); } ++alignIndex; } if (outFileName != "") { outFileStrm.close(); } return 0; }