Exemple #1
0
int main(int argc, char* argv[]) {
	string refFileName, queryFileName;
	int maxHammingDistance;
	if (argc < 4) {
		cout << "usage: hammer ref query maxHam " << endl;
		exit(1);
	}
	refFileName = argv[1];
	queryFileName = argv[2];
	maxHammingDistance = atoi(argv[3]);

	FASTAReader reader;
	reader.Initialize(refFileName);
	FASTASequence ref, refRC;
	reader.GetNext(ref);
	ref.MakeRC(refRC);
	
	FASTAReader queryReader;
	queryReader.Initialize(queryFileName);
	FASTASequence query;
	queryReader.GetNext(query);
	DNALength p;
	for(p=0; p < ref.length-query.length-1; p++ ){
		DNASequence subseq;
		subseq.seq = &ref.seq[p];
		subseq.length = query.length;
		//		cout << "t "; subseq.PrintSeq(cout);
		//		cout << "q "; ((DNASequence*)&query)->PrintSeq(cout);
		if (HammingDistance(&subseq.seq[0], &query.seq[0], query.length) < maxHammingDistance) {
			cout << ">" << p << endl;
			subseq.PrintSeq(cout);
		}
		int i;
		for (i =0; i < query.length; i++) {
			subseq.seq[i] = toupper(subseq.seq[i]);
		}
	}

	for(p=0; p < ref.length-query.length-1; p++ ){
		DNASequence subseq;
		subseq.seq = &refRC.seq[p];
		subseq.length = query.length;
		if (HammingDistance(&subseq.seq[0], &query.seq[0], query.length) < maxHammingDistance) {
			cout << ">" << p << "rc" << endl;
			subseq.PrintSeq(cout);
		}
		int i;
		for (i =0; i < query.length; i++) {
			subseq.seq[i] = toupper(subseq.seq[i]);
		}
	}

}
Exemple #2
0
int main(int argc, char* argv[]) {
	string genomeFileName, subseqFileName;
	if (argc != 3) {
		cout << "usage: extractRepeats genome repeat" << endl;
		exit(0);
	}

	genomeFileName = argv[1];
	subseqFileName = argv[2];

	FASTASequence genome, sub;
	FASTAReader reader;
	reader.Init(genomeFileName);
	reader.GetNext(genome);
	reader.Init(subseqFileName);
	reader.GetNext(sub);

	genome.ToUpper();
	sub.ToUpper();	
	DNALength genomePos;
	FASTASequence genomeSub;
	int kband = (int) (0.15) * sub.length;
	vector<int> scoreMat;
	vector<Arrow> pathMat;
	int readIndex = 0;
	cout << "starting extraction" << endl;
	for (genomePos = 0; genomePos < genome.length - sub.length + 1; genomePos++) {
		genomeSub.seq = &genome.seq[genomePos];
		genomeSub.length = sub.length;
		int alignScore;
		Alignment alignment;
		alignScore = SWAlign(genomeSub, sub,
												 EditDistanceMatrix, 1, //1,kband,
												 scoreMat, pathMat,
												 alignment, QueryFit);
												 
		if (alignScore < 0.25 * sub.length) {
			stringstream titlestrm;
			titlestrm << readIndex << "|" 
								 << genomePos << "|"
								<< genomePos + sub.length << " " << alignScore/ (1.0*sub.length);
			FASTASequence subcopy;
			subcopy.CopyTitle(titlestrm.str());
			subcopy.seq = &genome.seq[genomePos];
			subcopy.length = sub.length;
			subcopy.PrintSeq(std::cout);
			genomePos += sub.length;
		}
	}
}
int main(int argc, char* argv[]) {
	if (argc < 3) {
		cout << "usage: testBuildOccBins genomeFileName suffixArray" << endl;
		exit(0);
	}
	string genomeFileName      = argv[1];
	string suffixArrayFileName = argv[2];
	
	FASTAReader reader;
	reader.Init(genomeFileName);
	FASTASequence seq;
	reader.GetNext(seq);
	

	DNASuffixArray suffixArray;
	
	suffixArray.Read(suffixArrayFileName);

	Bwt<PackedDNASequence, FASTASequence> bwt;
	//bwt.InitializeFromSuffixArray(seq, suffixArray.index);

	bwt.InitializeBWTStringFromSuffixArray(seq, suffixArray.index);
	bwt.occ.Initialize(bwt.bwtSequence, 4096, 64);
	bwt.occ.PrintBins(cout);
}
//
// storeTuplePosList: convert the first sequence of seqFile into a list of
// position-annotated tuples, sort the list and write it to outFile.
//
// Usage: storeTuplePosList seqFile tupleSize outFile
//
int main(int argc, char* argv[]) {
	// All three positional arguments (argv[1..3]) are read below, so argc
	// must be at least 4; the previous test (argc < 3) let argv[3] be read
	// when it was absent.
	if (argc < 4) {
		cout << "usage: storeTuplePosList seqFile tupleSize outFile" << endl;
		return 0;
	}
	string seqFileName = argv[1];
	TupleMetrics tm;
	tm.tupleSize = atoi(argv[2]);
	string outFileName = argv[3];

	FASTAReader reader;
	reader.Init(seqFileName);
	FASTASequence seq;
	reader.GetNext(seq);

	TupleList<PositionDNATuple> tuplePosList;
	tuplePosList.SetTupleMetrics(tm);
	SequenceToTupleList(seq, tm, tuplePosList);
	tuplePosList.Sort();
	// WriteToFile() takes the file name itself; the separate ofstream that
	// used to be declared here was never opened and has been removed.
	tuplePosList.WriteToFile(outFileName);
	return 0;
}
Exemple #5
0
int main(int argc, char* argv[]) {
  string inFileName, outFileName;
  int length;
  inFileName = argv[1];
  outFileName = argv[2];
  length = atoi(argv[3]);

  int argi = 4;
  int stride = 0;
  float coverage = 0;
  while (argi < argc) {
    if (strcmp(argv[argi], "-stride")) {
      stride = atoi(argv[++argi]);
    }
    else if (strcmp(argv[argi], "-coverage")) {
      coverage = atof(argv[++argi]);
    }
    ++argi;
  }

  FASTAReader reader;
  reader.Initialize(inFileName);
  FASTASequence genome;
  reader.GetNext(genome);
  if (stride == 0 and coverage == 0) {
    cout << "ERROR, must provide stride or coverage. " << endl;
    exit(1);
  }
  if (stride == 0) {
    stride = genome.length * coverage / length;
  }
int main(int argc, char* argv[]) {
	if (argc < 3) {
		cout << "usage: bwtLocateList bwtName querySeqFile" << endl;
		exit(1);
	}
	string bwtFileName      = argv[1];
	string querySeqFileName = argv[2];
	bool doPrintResults = false;
	int maxCount = 0;
	int argi = 3;
	bool countOnly = false;
	while(argi < argc) {
		if (strcmp(argv[argi], "-print") == 0) {
			doPrintResults = true;
		}
		else if (strcmp(argv[argi], "-max") == 0) {
			maxCount = atoi(argv[++argi]);
		}
		else if (strcmp(argv[argi], "-count") == 0) {
			countOnly = true;
		}
		else {
			cout << "bad option: " << argv[argi] << endl;
		}
		++argi;
	}

 	Bwt<PackedDNASequence, FASTASequence> bwt;
	bwt.Read(bwtFileName);

	FASTAReader queryReader;
	queryReader.Init(querySeqFileName);
	FASTASequence seq;
	int seqIndex = 0;
	vector<DNALength> positions;
	while(queryReader.GetNext(seq)) {
		positions.clear();
		if (countOnly == false) {
			bwt.Locate(seq, positions, maxCount);
		}
		else {
			DNALength sp,ep;
			bwt.Count(seq, sp, ep);
		}
		//		cout << "matched " << positions.size() << " positions." << endl;
		if (doPrintResults) {
			int i;
			for (i = 0; i < positions.size(); i++ ){
				cout << positions[i] << " ";
			}
			cout << endl;
		}
		++seqIndex;
	}
		//	float wordCountsPerLookup = (bwt.bwtSequence.nCountInWord *1.0) / bwt.bwtSequence.nCountNuc;
		//	cout << "word counts per lookup: " << wordCountsPerLookup << endl;
	return 0;
}
Exemple #7
0
//
// exciseRepeats: remove every interval listed in a repeat-masker output file
// from the first sequence of inName and write what is left to outName.
//
// Usage: exciseRepeats inName repMaskOutFile outName
//
int main(int argc, char* argv[])
{
    std::string seqInName, seqOutName, dotOutName;
    if (argc < 4) {
        std::cout << "usage: exciseRepeats inName repMaskOutFile outName" << std::endl;
        std::exit(EXIT_FAILURE);
    }

    seqInName = argv[1];
    dotOutName = argv[2];
    seqOutName = argv[3];
    FASTAReader reader;
    reader.Initialize(seqInName);
    FASTASequence origSeq;
    reader.GetNext(origSeq);

    std::ifstream dotOutFile;
    CrucialOpen(dotOutName, dotOutFile);
    std::ofstream seqOut;
    CrucialOpen(seqOutName, seqOut, std::ios::out);

    // The first three lines of the repeat file are skipped (header lines).
    std::string dotOutLine;
    for (int headerLine = 0; headerLine < 3; headerLine++) {
        getline(dotOutFile, dotOutLine);
    }

    //
    // Pass 1: mark every position inside a listed interval with 'X'.
    //
    while (getline(dotOutFile, dotOutLine)) {
        std::stringstream lineStrm(dotOutLine);
        int swScore;
        float pctDiv, pctDel, pctIns;
        std::string query;
        DNALength qPosBegin = 0, qPosEnd = 0;
        // Only the columns up to the query interval are needed; the remaining
        // ones (left, strand, matching repeat, class, repeat positions, id)
        // are ignored. Lines that do not parse (blank or malformed) are
        // skipped instead of being applied with indeterminate coordinates.
        if (!(lineStrm >> swScore >> pctDiv >> pctDel >> pctIns >> query >> qPosBegin >> qPosEnd)) {
            continue;
        }
        // Never write past the end of the sequence.
        if (qPosEnd > origSeq.length) {
            qPosEnd = origSeq.length;
        }
        // NOTE(review): the interval is applied as 0-based, half-open
        // [qPosBegin, qPosEnd). RepeatMasker .out coordinates are presumably
        // 1-based and inclusive -- confirm against the file format.
        for (DNALength seqPos = qPosBegin; seqPos < qPosEnd; seqPos++) {
            origSeq.seq[seqPos] = 'X';
        }
    }

    //
    // Pass 2: compact the sequence in place, dropping every marked position.
    //
    DNALength unexPos = 0;
    for (DNALength seqPos = 0; seqPos < origSeq.length; seqPos++) {
        if (origSeq.seq[seqPos] != 'X') {
            origSeq.seq[unexPos] = origSeq.seq[seqPos];
            unexPos++;
        }
    }
    origSeq.length = unexPos;

    origSeq.PrintSeq(seqOut);
    return 0;
}
int main(int argc, char *argv[]) {
	string sequencesInName, sequencesOutName;
	if (argc <3){ 
		cout << "usage: scramble in out" << endl;
		exit(1);
	}
	sequencesInName = argv[1];
	sequencesOutName= argv[2];
	vector<FASTASequence*> sequences;
	vector<int> sequenceIndices;

	FASTAReader reader;
	reader.Init(sequencesInName);
	ofstream out;
	CrucialOpen(sequencesOutName, out, std::ios::out);
	

	FASTASequence read;
	FASTASequence*readPtr;
	while(reader.GetNext(read)) {
		readPtr = new FASTASequence;
		*readPtr = read;
		sequences.push_back(readPtr);
	}

	int i;
	for (i = 0; i < sequences.size(); i++) {
		sequenceIndices.push_back(i);
	}

	for (i = 0; i < 10*sequences.size(); i++ ){
		//
		// shuffle indices.
		//
		int idx1;
		int idx2;
		idx1 = RandomInt(sequences.size());
		idx2 = RandomInt(sequences.size());
		int tmp;
		tmp  = sequenceIndices[idx1];
		sequenceIndices[idx1] = sequenceIndices[idx2];
		sequenceIndices[idx2] = tmp;
	}
	
	for (i = 0; i < sequenceIndices.size(); i++ ){
		sequences[sequenceIndices[i]]->PrintSeq(out);
	}
	return 0;
}
Exemple #9
0
int main(int argc, char* argv[]) {

	CommandLineParser clp;
	string fastaFileName, indexFileName;
	vector<string> fastaFileNames;
	vector<string> opts;
	clp.SetProgramName("bsdb");
	clp.SetProgramSummary("Build an index database on a file of sequences.\n"
												" The index is used to map to reads given alignment positions.\n");
	clp.RegisterStringOption("fasta", &fastaFileName, "A file with sequences to build an index.");
	clp.RegisterStringOption("index", &indexFileName, "The index file.");
	clp.RegisterPreviousFlagsAsHidden();

	clp.ParseCommandLine(argc, argv, opts);

	ifstream fastaIn;
	ofstream indexOut;

	if (FileOfFileNames::IsFOFN(fastaFileName)) {
		FileOfFileNames::FOFNToList(fastaFileName, fastaFileNames);
	}
	else {
		fastaFileNames.push_back(fastaFileName);
	}

	CrucialOpen(indexFileName, indexOut, std::ios::out | std::ios::binary);
	SequenceIndexDatabase<FASTASequence> seqDB;
		
	int fileNameIndex;
	for (fileNameIndex = 0; fileNameIndex < fastaFileNames.size(); fileNameIndex++){ 
		FASTAReader reader;
		FASTASequence seq;
		reader.Init(fastaFileNames[fileNameIndex]);
		int i = 0;
		while (reader.GetNext(seq)) {
			seqDB.AddSequence(seq);
			i++;
		}
	}
	seqDB.Finalize();
	seqDB.WriteDatabase(indexOut);
	return 0;
}
Exemple #10
0
int main(int argc, char* argv[]) {
		if (argc < 4) {
			cout << "usage: splitContigs in.fa contiglength out" << endl;
			exit(1);
		}
		string inFileName, outFileName;
		inFileName = argv[1];
		int contigLength = atoi(argv[2]);		
		outFileName = argv[3];

		ofstream seqOut;
		CrucialOpen(outFileName, seqOut, std::ios::out);
		FASTAReader reader;
		reader.Init(inFileName);
		FASTASequence seq;
		DNALength curOffset;
		
		while(reader.GetNext(seq)) {
			FASTASequence subseq;
			int i;
			curOffset = 0;
			for (i =0 ; i < seq.length / contigLength + 1; i++ ) {
				subseq.seq = &seq.seq[curOffset];
				subseq.title = seq.title;
				if (curOffset + contigLength > seq.length) {
					subseq.length = seq.length - curOffset;
				}
				else {
					subseq.length = contigLength;
				}
				subseq.PrintSeq(seqOut);
				curOffset += contigLength;
			}
		}
		return 0;
}
int main(int argc, char* argv[1]) {
	if (argc < 3) {
		cout << "Usage: findUnique genome.fasta query.fasta effective_k [options]" << endl;
		cout << "  genome.fasta.sa must exist." << endl;
		cout << "  Finds sequences at least effective_k in length that are unique." << endl;
		cout << "  -max m       Allow up to m matches" << endl;
		cout << "  -minLength l Ensure the length of the match is at least this." << endl;
		cout << "  -prefix p n  Allow up to n matches across a prefix of length p" << endl;
		cout << "  -suffix s n  Allow up to n matches across a suffix of length s" << endl;
		cout << "               Prefix and suffix options override max." << endl;
		cout << "  -out file    Print queries to this output file (query.fasta.queries)" << endl;
		exit(0);
	}

	DNASuffixArray sarray;
	
	string genomeFileName = argv[1];
	string suffixArrayFileName = genomeFileName + ".sa";
	
	FASTAReader reader;
	FASTASequence genome;

	int maxN = 0;

	int prefix = 0;
	int suffix = 0;
	int prefixN = 0;
	int suffixN = 0;
	int argi = 4;
	string outputFileName = "";
	int minLength = 0;
	while (argi < argc) {
		if (strcmp(argv[argi], "-max") == 0) {
			++argi;
			maxN = atoi(argv[argi]);
		}
		else if (strcmp(argv[argi], "-prefix") == 0) {
			++argi;
			prefix = atoi(argv[argi]);
			++argi;
			prefixN = atoi(argv[argi]);
		}
		else if (strcmp(argv[argi], "-suffix") == 0) {
			++argi;
			suffix = atoi(argv[argi]);
			++argi;
			suffixN = atoi(argv[argi]);
		}
		else if (strcmp(argv[argi], "-out") == 0) {
			++argi;
			outputFileName = argv[argi];
		}
		else if (strcmp(argv[argi], "-minLength") == 0) {
			++argi;
			minLength = atoi(argv[argi]);
		}
		++argi;
	}

	reader.Initialize(genomeFileName);
	reader.ReadAllSequencesIntoOne(genome);
	sarray.Read(suffixArrayFileName);

	FASTAReader queryReader;
	FASTASequence querySequence;
	string queryFileName = argv[2];
	int maxLength = atoi(argv[3]);
	string summaryTableFileName = queryFileName + ".summary";
	if (outputFileName == "") {
		outputFileName = queryFileName + ".queries";
	}
		
	
	ofstream summaryTable(summaryTableFileName.c_str());
	ofstream outputFile(outputFileName.c_str());

	queryReader.Initialize(queryFileName);

	while (queryReader.GetNext(querySequence)) {
		int i;
		cerr << "searching " << querySequence.title << endl;
		if (querySequence.length < maxLength) {
			continue;
		}

		int nMatches = 0;
		querySequence.ToUpper();
		int localMax;
		for (i = 0; i < querySequence.length - maxLength + 1; i++) {
			if ((i + 1) % 100000 == 0) {
				cerr << "processed: " << i + 1 << endl;
			}

			int lcpLength;
			vector<SAIndex> lcpLeftBounds, lcpRightBounds;
			vector<SAIndex> rclcpLeftBounds, rclcpRightBounds;
			localMax = maxN;
			if (i < prefix) {
				localMax = prefixN;
			}
			if (i >= querySequence.length - suffix) {
				localMax = suffixN;
			}
			if (querySequence.length - i <= maxLength) {
				continue;
			}
			if (querySequence.seq[i] == 'N') {
				continue;
			}
			lcpLength = sarray.StoreLCPBounds(genome.seq, genome.length, // The string which the suffix array is built on.
																				&querySequence.seq[i], querySequence.length-i,
																				true,
																				maxLength,
																				lcpLeftBounds, lcpRightBounds,
																				false);
			if (lcpLength < minLength) {
				continue;
			}
			if (lcpLength < maxLength or 
					lcpRightBounds.size() == 0 or 
					(lcpRightBounds.size() > 0 and 
					 lcpLeftBounds.size() > 0 and  
					 lcpRightBounds[lcpRightBounds.size() - 1] - lcpLeftBounds[lcpLeftBounds.size()-1] <= localMax)) {

				FASTASequence rc;
				DNASequence subseq;
				subseq.ReferenceSubstring(querySequence, i, maxLength);
				subseq.MakeRC(rc);
				int rclcpLength;
				int numForwardMatches;
				if (lcpLength == 0) {
					numForwardMatches = 0;
				}
				else {
					numForwardMatches = lcpRightBounds[lcpRightBounds.size() - 1] - lcpLeftBounds[lcpLeftBounds.size()-1];
				}
				rclcpLength = sarray.StoreLCPBounds(genome.seq, genome.length, // The string which the suffix array is built on.
																						rc.seq, maxLength,
																						true,
																						rclcpLength,
																						rclcpLeftBounds, rclcpRightBounds,
																						false);

				string rcstr((const char*)rc.seq, rc.length);

				if (rclcpLength < maxLength or 
						rclcpRightBounds.size() == 0 or
						(numForwardMatches + 
						 rclcpRightBounds[rclcpRightBounds.size() - 1] -
						 rclcpLeftBounds[rclcpLeftBounds.size()-1] <= localMax)) 
					{
						char* substr = new char[maxLength+1];
						substr[maxLength] = '\0';
						memcpy(substr, &querySequence.seq[i], maxLength);

						//						string substr = string((const char*) querySequence.seq, i, maxLength);
						
						outputFile << querySequence.title << "\t" << substr << "\t" << i << endl;

						++nMatches;
						delete[] substr;
						//					}
					}
				rc.Free();
			}

		}
		summaryTable << querySequence.title << "\t" << nMatches << endl;
		querySequence.Free();
	}
	outputFile.close();
	genome.Free();
}
Exemple #12
0
//
// Runs SplitRead() with two adapter sequences on every read of a FASTA file
// and prints, for each read, the read itself followed by one line per pass
// that SplitRead() reported (start, length and the matching la[] entry).
//
// NOTE(review): readsOutFile is opened (and the program quits if that
// fails), but nothing in this function writes to it.
//
int main(int argc, char* argv[]) {
	string ad1File, ad2File, readsFile, readsOutFile;

	FASTAReader ad1Reader;
	FASTAReader ad2Reader;
	FASTAReader reader;

	CommandLineParser cl;
	float minPctSimilarity = 0.60;
	int indel = 3;
	int minLength = 10;
	cl.RegisterStringOption("ad1", &ad1File, "FASTA file with the first adapter");
	cl.RegisterStringOption("ad2", &ad2File, "FASTA file with the second adapter");
	cl.RegisterStringOption("reads", &readsFile, "FASTA file with SMRTBell reads");
	cl.RegisterStringOption("readsout", &readsOutFile, "output file for split reads");
	cl.RegisterPreviousFlagsAsHidden();
	cl.RegisterFloatOption("pctSim", &minPctSimilarity, "Minimum percent similarity to trigger a match to an adapter.",
												 CommandLineParser::PositiveFloat);
	cl.RegisterIntOption("indel", &indel, "Penalty for indel (positive)", CommandLineParser::NonNegativeInteger);
	cl.RegisterIntOption("minLength", &minLength, "Minimum length pass to retain.", CommandLineParser::PositiveInteger);
	vector<string> opts;
	cl.ParseCommandLine(argc, argv, opts);

	/*
	 * Open all the required files, quitting if they are unavailable.
	 */

	ad1Reader.Init(ad1File);
	ad2Reader.Init(ad2File);
	reader.Init(readsFile);

	ofstream splitOut;
	CrucialOpen(readsOutFile, splitOut);

	// Only the first record of each adapter file is used.
	FASTASequence ad1, ad2;
	ad1Reader.GetNext(ad1);
	ad2Reader.GetNext(ad2);

	FASTASequence read;
	vector<int> scoreMat;
	vector<Arrow> pathMat;
	int readIndex = 0;
	while(reader.GetNext(read)) {
		read.ToUpper();
		//
		// Do a fitting sequence alignment to match one of the two
		// adapters into the read.
		//
		vector<int> passStarts, passLengths, la;
		read.PrintSeq(cout);
		SplitRead(read, 0, read.length, ad1, ad2,
							indel,
							passStarts, passLengths,la, 0,
							scoreMat, pathMat, minPctSimilarity, minLength);
		//
		// One line per pass. The pass index is printed after "pass:"; in this
		// copy of the file that part of the statement had been replaced by a
		// run of asterisks and no longer compiled.
		//
		vector<int>::size_type i;
		for (i = 0; i < passStarts.size(); i++) {
			cout << "read: " << readIndex << " pass: " << i << " " << passStarts[i] << " " << passLengths[i] << " " << la[i] << endl;
		}
		++readIndex;
	}
}
Exemple #13
0
int main(int argc, char* argv[]) {
	CommandLineParser clp;

	string refGenomeName;
	string mutGenomeName;
  string gffFileName;
	float insRate = 0;
	float delRate = 0;
	float mutRate = 0;
  bool  lower = false;
  gffFileName = "";
	clp.RegisterStringOption("refGenome", &refGenomeName, "Reference genome.", true);
	clp.RegisterStringOption("mutGenome", &mutGenomeName, "Mutated genome.", true);
	clp.RegisterPreviousFlagsAsHidden();
  clp.RegisterStringOption("gff", &gffFileName, "GFF file describing the modifications made to the genome.");
	clp.RegisterFloatOption("i", &insRate, "Insertion rate: (0-1].", 
													CommandLineParser::NonNegativeFloat, false);
	clp.RegisterFloatOption("d", &delRate, "Deletion rate: (0-1]", 
													CommandLineParser::NonNegativeFloat, false);
	clp.RegisterFloatOption("m", &mutRate, "Mutation rate, even across all nucleotides: (0-1]", 
													CommandLineParser::NonNegativeFloat, false);
  clp.RegisterFlagOption("lower", &lower, "Make mutations in lower case", false);
	vector<string> leftovers;
	clp.ParseCommandLine(argc, argv, leftovers);
  
	FASTAReader reader;
	FASTASequence refGenome;

	reader.Init(refGenomeName);
	ofstream mutGenomeOut;
	CrucialOpen(mutGenomeName, mutGenomeOut, std::ios::out);
  ofstream gffOut;
  if (gffFileName != "") {
    CrucialOpen(gffFileName, gffOut, std::ios::out);
  }

	vector<int> insIndices, delIndices, subIndices;
	int readIndex = 0;
	InitializeRandomGeneratorWithTime();
	while (reader.GetNext(refGenome)) {
		insIndices.resize(refGenome.length);
		delIndices.resize(refGenome.length);
    subIndices.resize(refGenome.length);
		std::fill(insIndices.begin(), insIndices.end(), false);
		std::fill(delIndices.begin(), delIndices.end(), false);
    std::fill(subIndices.begin(), subIndices.end(), 0);

		enum ChangeType { Ins, Del, Mut, None};
		float changeProb[4];
		changeProb[Ins] = insRate;
		changeProb[Del] = changeProb[Ins] + delRate;
		changeProb[Mut] = changeProb[Del] + mutRate;
		changeProb[None] = 1;

		if (changeProb[Mut] > 1) {
			cout << "ERROR! The sum of the error probabilities must be less than 1" << endl;
			exit(1);
		}
		DNALength pos;
		float randomNumber;
		int numIns = 0;
		int numDel = 0;
		int numMut = 0;
		for (pos =0 ; pos < refGenome.length; pos++) { 
			randomNumber = Random();
			if (randomNumber < changeProb[Ins]) {
				insIndices[pos] = true;
				numIns++;
			}
			else if (randomNumber < changeProb[Del]) {
				delIndices[pos] = true;
				numDel++;
			}
			else if (randomNumber < changeProb[Mut]){ 
				Nucleotide newNuc = TwoBitToAscii[RandomInt(4)];
				int maxIts = 100000;
				int it = 0;
				while (newNuc == refGenome.seq[pos]) {
					newNuc = TwoBitToAscii[RandomInt(4)];
					if (it == maxIts) {
						cout << "ERROR, something is wrong with the random number generation, it took too many tries to generate a new nucleotide" << endl;
						exit(1);
					}
				}
        subIndices[pos] = refGenome[pos];
				refGenome.seq[pos] = ToLower(newNuc,lower);
				++numMut;
			}
		}
		//		cout << readIndex << " m " << numMut << " i " << numIns << " d " << numDel << endl;
		if (readIndex % 100000 == 0 && readIndex > 0) {
			cout << readIndex << endl;
		}
		// 
		// Now add the insertions and deletions.
		//
		FASTASequence newSequence;
		DNALength   newPos;
		if (numIns - numDel + refGenome.length < 0) {
			cout << "ERROR, the genome has been deleted to nothing." << endl;
			exit(1);
		}
		ResizeSequence(newSequence, refGenome.length + (numIns - numDel));
		newPos = 0;
		pos = 0;
		for (pos = 0; pos < refGenome.length; pos++) {
			assert(newPos < newSequence.length or delIndices[pos] == true);
      if (subIndices[pos] != 0 and gffFileName != "") {
        gffOut << refGenome.GetName() << "	.	SNV	" << newPos << " " << newPos <<" 0.00	.	.	reference=" << (char)subIndices[pos] << ";confidence=10;Name=" << newPos << (char)subIndices[pos] << ">" << refGenome.seq[pos] <<";coverage=10;variantseq=" << refGenome.seq[pos] << endl;
      }
        
			if (insIndices[pos] == true) {
				newSequence.seq[newPos] = ToLower(TwoBitToAscii[RandomInt(4)], lower);
				newPos++;
				newSequence.seq[newPos] = refGenome.seq[pos];
        
				assert(newSequence.seq[newPos] != '1');
				assert(newSequence.seq[newPos] != 1);
        if (gffFileName != "") {
          gffOut << refGenome.GetName() << "	.	deletion	" << newPos << " " << newPos << " 0.00	.	.	reference=" << newSequence.seq[newPos] << ";length=1;confidence=10;coverage=0;Name="<< newPos << "del" << newSequence.seq[newPos] << endl;
        }
				newPos++;
			}
			else if (delIndices[pos] == true) {
				// no-op, skip
        if (gffFileName != "") {
          gffOut << refGenome.GetName() << "	.	insertion	" << newPos << " " << newPos << " 0.00	.	.	confidence=10;Name=" << newPos << "_ins" << refGenome.seq[pos] << ";reference=.;length=1;coverage=0;variantseq=" << refGenome.seq[newPos] << endl;
//ref000001	.	deletion	20223	20223	0.00	.	.	reference=T;length=1;confidence=0;coverage=0;Name=20222delT
        }
			}
			else {
				newSequence.seq[newPos] = refGenome.seq[pos];
				newPos++;
			}
		}
		stringstream titlestrm;
		titlestrm << " mutated ins " << insRate << " del " << delRate << " mut " << mutRate;
		newSequence.CopyTitle(refGenome.title);
		newSequence.AppendToTitle(titlestrm.str());
		newSequence.PrintSeq(mutGenomeOut);
    newSequence.Free();
		readIndex++;
	}
}
Exemple #14
0
int main(int argc, char* argv[]) {
  FASTAReader reader;
  if (argc < 5) {
	cout << "usage: wordCounter seqFile tupleSize tupleOutputFile posOutputFile" << endl;
	exit(1);
  }

  string fileName = argv[1];
  int    tupleSize = atoi(argv[2]);
  string tupleListName = argv[3];
	string posOutName    = argv[4];
  
	TupleMetrics tm;
  tm.Initialize(tupleSize);
  reader.Init(fileName);

  FASTASequence seq;
  reader.GetNext(seq);

  vector<CountedDNATuple> tupleList;
  CountedDNATuple tuple;
  DNALength i;
  for (i = 0; i < seq.length - tm.tupleSize + 1; i++ ) {
		if (tuple.FromStringRL((Nucleotide*) (seq.seq + i), tm)) {
			tuple.count = i;
			tupleList.push_back(tuple);
		}
  }

  std::sort(tupleList.begin(), tupleList.end());

  int t;
  int t2;
  int numTuples = tupleList.size();
  t = t2 = 0;
  int numUnique = 0;
  while (t < numTuples) {
	t2 = t;
	t2++;
	while (t2 < numTuples and tupleList[t] == tupleList[t2]) {
	  t2++;
	}
	++numUnique;
	t = t2;
  }

  ofstream countedTupleListOut;
  countedTupleListOut.open(tupleListName.c_str(), ios_base::binary);

	ofstream posOut;
	posOut.open(posOutName.c_str(), ios_base::binary);

  countedTupleListOut.write((const char*) &numUnique, sizeof(int));
  countedTupleListOut.write((const char*) &tm.tupleSize, sizeof(int));

  posOut.write((const char*) &numUnique, sizeof(int));

	//
	// Write out the tuple+counts to a file.
	//
  t = t2 = 0;
  CountedDNATuple countedTuple;
	int numMultOne = 0;
  while (t < numTuples) {
		t2 = t;
		t2++;
		while (t2 < numTuples and tupleList[t] == tupleList[t2]) {
			t2++;
		}
		countedTuple.tuple = tupleList[t].tuple;
		countedTuple.count = t2 - t;
		if (countedTuple.count == 1) ++numMultOne;
		countedTupleListOut.write((const char*) &countedTuple,sizeof(CountedDNATuple));
		
		posOut.write((char*)&countedTuple.count, sizeof(int));
		
		int tc;
		for (tc = t; tc < t2; tc++) {
			posOut.write((char*) &tupleList[tc].count, sizeof(int));
		}
		t = t2;
  }

	//
	// Write out the positions of the tuples to a file.
	//
	
	posOut.close();
	countedTupleListOut.close();

	//  cout << "found " << numUnique << " distinct " << DNATuple::TupleSize << "-mers." << endl;
	cout << numMultOne << endl;
  return 0;
}
int main(int argc, char* argv[]) {

	string cmpFileName;
	string refFileName;
	string readsFileName;
  string mapqvTrackName;
	if (argc < 2) {
		cout << "  printMapqvTrack: print a gff file of the average mapping quality value" << endl;
		exit(1);
	}
	vector<int> refPositions;
	cmpFileName = argv[1];
	refFileName = argv[2];
  mapqvFileName = argv[3];

	CmpFile cmpFile;
	FASTASequence ref;
	FASTAReader reader;

	reader.Initialize(refFileName);
	reader.GetNext(ref);

	HDFBasReader basReader;

	SMRTSequence seq, *seqPtr;

	vector<int> refCoverage;
	refCoverage.resize(ref.length);
	std::fill(refCoverage.begin(), refCoverage.end(), 0);
	/*
	 * These guys pull information from the same pls file.
	 */
	HDFCmpReader<CmpAlignment> cmpReader;


	if (cmpReader.Initialize(cmpFileName) == 0) {
		cout << "ERROR, could not open the cmp file." << endl;
		exit(1);
	}
	
	
	cmpReader.Read(cmpFile);
	UInt alignmentIndex;

	//	movieIndexSets.resize(nMovies);
	for (alignmentIndex = 0; alignmentIndex < cmpFile.alnInfo.alignments.size(); alignmentIndex++) {
		int refSeqId    = cmpFile.alnInfo.alignments[alignmentIndex].GetRefSeqId();
		int readGroupId = cmpFile.alnInfo.alignments[alignmentIndex].GetReadGroupId();
		int refSeqIdIndex;
		if (cmpFile.refSeqTable.GetIndexOfId(refSeqId, refSeqIdIndex) == false) {
			//
			// Sanity check -- we're only looking at alignments to references in the cmp file.
			//
			cout << "ERROR, ref seq id: " << refSeqId << " should exist in the cmp file but it does not." << endl;
			assert(0);
		}

		int readGroupIdIndex;
		cmpFile.readGroupTable.GetIndexOfId(readGroupId, readGroupIdIndex);
		
		string readGroupPath    = cmpFile.readGroupTable.names[readGroupIdIndex];
		string readGroup        = cmpReader.readGroupPathToReadGroup[readGroupPath];
		int readGroupArrayIndex = cmpReader.refAlignGroups[refSeqIdIndex]->experimentNameToIndex[readGroup];
		vector<char> alignedSequence, alignedTarget;

		//
		// This read overlaps one of the ref positions.
		
		UInt offsetEnd, offsetBegin;
				
		offsetEnd   = cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetEnd();
		offsetBegin = cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetBegin();
		vector<unsigned char> byteAlignment;
		int alignedSequenceLength = offsetEnd - offsetBegin;
		if (alignedSequenceLength >= 0) {
			alignedSequence.resize(alignedSequenceLength);
			alignedTarget.resize(alignedSequenceLength);
			byteAlignment.resize(alignedSequenceLength);
		}

		cmpReader.refAlignGroups[refSeqIdIndex]->readGroups[readGroupArrayIndex]->alignmentArray.Read(offsetBegin, offsetEnd, &byteAlignment[0]);
		UInt refStart = cmpFile.alnInfo.alignments[alignmentIndex].GetRefStart();
		UInt refEnd   = cmpFile.alnInfo.alignments[alignmentIndex].GetRefEnd();
		UInt readStart= cmpFile.alnInfo.alignments[alignmentIndex].GetQueryStart();
		UInt readEnd  = cmpFile.alnInfo.alignments[alignmentIndex].GetQueryEnd();
		//
		// Read the alignment string.
		//
		if (refSeqIdIndex > 0) continue;


		

		
		//
		// Convert to something we can compare easily.
		//
		alignedSequence[alignedSequence.size()-1]= '\0';
		ByteAlignmentToQueryString(&byteAlignment[0], byteAlignment.size(), &alignedSequence[0]);
		ByteAlignmentToRefString(&byteAlignment[0], byteAlignment.size(), &alignedTarget[0]);
		int gi, i;
		gi = 0;
		int refStrand =  cmpFile.alnInfo.alignments[alignmentIndex].GetRCRefStrand();
		if (refStrand == 1) {
			// revcomp the ref strand
			vector<char> rcAlignedTarget, rcAlignedQuery;
			int t;
			rcAlignedTarget.resize(alignedTarget.size());
			rcAlignedQuery.resize(alignedSequence.size());
			for (t = 0; t < alignedTarget.size(); t++) {
				if (alignedTarget[t] == ' ') {
					rcAlignedTarget[alignedTarget.size() - t - 1] = ' ';
				}
				else {
					rcAlignedTarget[alignedTarget.size() - t - 1] = ReverseComplementNuc[alignedTarget[t]];
				}
				if (alignedSequence[t] == ' '){ 
					rcAlignedQuery[alignedTarget.size()  - t - 1] = ' ';
				}
				else {
					rcAlignedQuery[alignedTarget.size() - t - 1] = ReverseComplementNuc[alignedTarget[t]];
				}
			}
			alignedTarget = rcAlignedTarget;
			alignedSequence = rcAlignedQuery;
		}
		
		int holeNumber = cmpFile.alnInfo.alignments[alignmentIndex].GetHoleNumber();
		int ri = readStart;

		gi = refStart;

		for (i = 0; i < alignedTarget.size(); i++, gi++, ri++ ) {
			while(i < alignedTarget.size() and alignedTarget[i] == ' ') { 
				i++; 
			}
			if (alignedSequence[i] != ' ') {
				refCoverage[gi]++;
			}
		}
	} // end looping over regions

// Now compute the number of gaps.
	UInt pos;
	int numNotCovered = 0;
	for (pos = 0; pos < refCoverage.size(); pos++ ){
		if (refCoverage[pos] < 1) { numNotCovered++;}
	}
	if (numNotCovered > 100) {
		cout << "TOO Many!!!" << endl;
	}
	else {
		for (pos = 0; pos < refCoverage.size(); pos++ ){
			//		cout << refCoverage[pos] << endl;
			if (refCoverage[pos] < 1) {
				int left, right;
				left = right = -1;
				if (pos > 0) { left = refCoverage[pos-1];}
				if (pos < refCoverage.size()-1) {right = refCoverage[pos+1];}
				cout << pos << " " << left << " " << right << endl;
			}
		}
	}

}