Exemplo n.º 1
0
int main(int argc, char* argv[]) {
	string ad1File, ad2File, readsFile, readsOutFile;

	FASTAReader ad1Reader;
	FASTAReader ad2Reader;
	FASTAReader reader;


	CommandLineParser cl;
	float minPctSimilarity = 0.60;
	int indel = 3;
	int minLength = 10;
	cl.RegisterStringOption("ad1", &ad1File, "FASTA file with the first adapter");
	cl.RegisterStringOption("ad2", &ad2File, "FASTA file with the second adapter");
	cl.RegisterStringOption("reads", &readsFile, "FASTA file with SMRTBell reads");
	cl.RegisterStringOption("readsout", &readsOutFile, "output file for split reads");
	cl.RegisterPreviousFlagsAsHidden();
	cl.RegisterFloatOption("pctSim", &minPctSimilarity, "Minimum percent similarity to trigger a match to an adapter.", 
												 CommandLineParser::PositiveFloat);
	cl.RegisterIntOption("indel", &indel, "Penalty for indel (positive)", CommandLineParser::NonNegativeInteger);
	cl.RegisterIntOption("minLength", &minLength, "Minimum length pass to retain.", CommandLineParser::PositiveInteger);
	vector<string> opts;
	cl.ParseCommandLine(argc, argv, opts);

	/*
	 * Open all the required files, quitting if they are unavailable.
	 */

	ad1Reader.Init(ad1File);
	ad2Reader.Init(ad2File);
	reader.Init(readsFile);

	ofstream splitOut;
	CrucialOpen(readsOutFile, splitOut);

	FASTASequence ad1, ad2;
	ad1Reader.GetNext(ad1);
	ad2Reader.GetNext(ad2);

	FASTASequence read;
	vector<int> scoreMat;
	vector<Arrow> pathMat;
	int readIndex = 0;
	while(reader.GetNext(read)) {
		read.ToUpper();
		//
		// Do a fitting sequence alignment to match one of the two 
		// adapters into the read.
		//
		vector<int> passStarts, passLengths, la;
		read.PrintSeq(cout);
		SplitRead(read, 0, read.length, ad1, ad2,
							indel, 
							passStarts, passLengths,la, 0,
							scoreMat, pathMat, minPctSimilarity, minLength);
		int i;
		for (i = 0; i < passStarts.size(); i++) {
			cout << "read: " << readIndex << " pass: "******" " << passStarts[i] << " " << passLengths[i] << " " << la[i] << endl;
		}
		++readIndex;
	}
}
Exemplo n.º 2
0
int main(int argc, char* argv[1]) {
	if (argc < 3) {
		cout << "Usage: findUnique genome.fasta query.fasta effective_k [options]" << endl;
		cout << "  genome.fasta.sa must exist." << endl;
		cout << "  Finds sequences at least effective_k in length that are unique." << endl;
		cout << "  -max m       Allow up to m matches" << endl;
		cout << "  -minLength l Ensure the length of the match is at least this." << endl;
		cout << "  -prefix p n  Allow up to n matches across a prefix of length p" << endl;
		cout << "  -suffix s n  Allow up to n matches across a suffix of length s" << endl;
		cout << "               Prefix and suffix options override max." << endl;
		cout << "  -out file    Print queries to this output file (query.fasta.queries)" << endl;
		exit(0);
	}

	DNASuffixArray sarray;
	
	string genomeFileName = argv[1];
	string suffixArrayFileName = genomeFileName + ".sa";
	
	FASTAReader reader;
	FASTASequence genome;

	int maxN = 0;

	int prefix = 0;
	int suffix = 0;
	int prefixN = 0;
	int suffixN = 0;
	int argi = 4;
	string outputFileName = "";
	int minLength = 0;
	while (argi < argc) {
		if (strcmp(argv[argi], "-max") == 0) {
			++argi;
			maxN = atoi(argv[argi]);
		}
		else if (strcmp(argv[argi], "-prefix") == 0) {
			++argi;
			prefix = atoi(argv[argi]);
			++argi;
			prefixN = atoi(argv[argi]);
		}
		else if (strcmp(argv[argi], "-suffix") == 0) {
			++argi;
			suffix = atoi(argv[argi]);
			++argi;
			suffixN = atoi(argv[argi]);
		}
		else if (strcmp(argv[argi], "-out") == 0) {
			++argi;
			outputFileName = argv[argi];
		}
		else if (strcmp(argv[argi], "-minLength") == 0) {
			++argi;
			minLength = atoi(argv[argi]);
		}
		++argi;
	}

	reader.Initialize(genomeFileName);
	reader.ReadAllSequencesIntoOne(genome);
	sarray.Read(suffixArrayFileName);

	FASTAReader queryReader;
	FASTASequence querySequence;
	string queryFileName = argv[2];
	int maxLength = atoi(argv[3]);
	string summaryTableFileName = queryFileName + ".summary";
	if (outputFileName == "") {
		outputFileName = queryFileName + ".queries";
	}
		
	
	ofstream summaryTable(summaryTableFileName.c_str());
	ofstream outputFile(outputFileName.c_str());

	queryReader.Initialize(queryFileName);

	while (queryReader.GetNext(querySequence)) {
		int i;
		cerr << "searching " << querySequence.title << endl;
		if (querySequence.length < maxLength) {
			continue;
		}

		int nMatches = 0;
		querySequence.ToUpper();
		int localMax;
		for (i = 0; i < querySequence.length - maxLength + 1; i++) {
			if ((i + 1) % 100000 == 0) {
				cerr << "processed: " << i + 1 << endl;
			}

			int lcpLength;
			vector<SAIndex> lcpLeftBounds, lcpRightBounds;
			vector<SAIndex> rclcpLeftBounds, rclcpRightBounds;
			localMax = maxN;
			if (i < prefix) {
				localMax = prefixN;
			}
			if (i >= querySequence.length - suffix) {
				localMax = suffixN;
			}
			if (querySequence.length - i <= maxLength) {
				continue;
			}
			if (querySequence.seq[i] == 'N') {
				continue;
			}
			lcpLength = sarray.StoreLCPBounds(genome.seq, genome.length, // The string which the suffix array is built on.
																				&querySequence.seq[i], querySequence.length-i,
																				true,
																				maxLength,
																				lcpLeftBounds, lcpRightBounds,
																				false);
			if (lcpLength < minLength) {
				continue;
			}
			if (lcpLength < maxLength or 
					lcpRightBounds.size() == 0 or 
					(lcpRightBounds.size() > 0 and 
					 lcpLeftBounds.size() > 0 and  
					 lcpRightBounds[lcpRightBounds.size() - 1] - lcpLeftBounds[lcpLeftBounds.size()-1] <= localMax)) {

				FASTASequence rc;
				DNASequence subseq;
				subseq.ReferenceSubstring(querySequence, i, maxLength);
				subseq.MakeRC(rc);
				int rclcpLength;
				int numForwardMatches;
				if (lcpLength == 0) {
					numForwardMatches = 0;
				}
				else {
					numForwardMatches = lcpRightBounds[lcpRightBounds.size() - 1] - lcpLeftBounds[lcpLeftBounds.size()-1];
				}
				rclcpLength = sarray.StoreLCPBounds(genome.seq, genome.length, // The string which the suffix array is built on.
																						rc.seq, maxLength,
																						true,
																						rclcpLength,
																						rclcpLeftBounds, rclcpRightBounds,
																						false);

				string rcstr((const char*)rc.seq, rc.length);

				if (rclcpLength < maxLength or 
						rclcpRightBounds.size() == 0 or
						(numForwardMatches + 
						 rclcpRightBounds[rclcpRightBounds.size() - 1] -
						 rclcpLeftBounds[rclcpLeftBounds.size()-1] <= localMax)) 
					{
						char* substr = new char[maxLength+1];
						substr[maxLength] = '\0';
						memcpy(substr, &querySequence.seq[i], maxLength);

						//						string substr = string((const char*) querySequence.seq, i, maxLength);
						
						outputFile << querySequence.title << "\t" << substr << "\t" << i << endl;

						++nMatches;
						delete[] substr;
						//					}
					}
				rc.Free();
			}

		}
		summaryTable << querySequence.title << "\t" << nMatches << endl;
		querySequence.Free();
	}
	outputFile.close();
	genome.Free();
}
Exemplo n.º 3
0
int main(int argc, char* argv[]) {
	string genomeFileName;
	string suffixArrayFileName;
	if (argc < 4) {
		cout << "Usage: printWordCount genome suffixArray k [k2 k3 k4...]" << endl;
		exit(1);
	}
	genomeFileName = argv[1];
	suffixArrayFileName = argv[2];
	int argi = 3;
	vector<DNALength> k;
	while (argi < argc) {
		k.push_back(atoi(argv[argi]));
		argi++;
	}

	// Get the ref sequence.
	FASTAReader reader;
	reader.Init(genomeFileName);
	FASTASequence seq;
  //	reader.GetNext(seq);
  reader.ReadAllSequencesIntoOne(seq);
	seq.ToUpper();
	// Get the suffix array.
	DNASuffixArray sarray;
	sarray.Read(suffixArrayFileName);
	
	int ki;
  char *word;
  cout << "wordlen word nword" << endl;
	for (ki = 0; ki < k.size(); ki++) {
    word = new char[k[ki]+1];
    word[k[ki]] = '\0';
		DNALength i;
		DNALength numUnique = 0;
		for (i = 0; i < seq.length - k[ki] - 1; ) {
			DNALength j = i + 1;
      bool seqAtN = false;
      int si;
      for(si = 0; si < k[ki]; si++) {
        if (seq.seq[sarray.index[i] + si] == 'N') {
          seqAtN = true;
          break;
        }
      }
      if (seqAtN) {
        i++;
        continue;
      }
			while (j < seq.length - k[ki] and 
						 seq.length - sarray.index[i] >= k[ki] and
						 seq.length - sarray.index[j] >= k[ki] and 
						 strncmp((const char*) &seq.seq[sarray.index[i]], (const char*) &seq.seq[sarray.index[j]], k[ki]) == 0) {
				j++;
			}
      if (seq.length - sarray.index[i] >= k[ki]) {
        for(si = 0; si < k[ki]; si++) {
          word[si] = seq.seq[sarray.index[i]+si];
        }
        cout << k[ki] << " " << word << " " << j - i + 1 << endl;
        if (j == i + 1) { 
          ++numUnique;
        }
      }
			i = j;
		}
	}
}