C++ (Cpp) FASTAReader::ReadAllSequencesIntoOneの例

コード例 #1

0

ファイルを表示

ファイル: SAModify.cpp プロジェクト: BioinformaticsArchive/blasr

/**
 * Loads an existing suffix array, optionally builds a lookup table on it
 * using the genome it indexes, and writes the result to a new file.
 *
 * Positional arguments (all required):
 *   argv[1]  suffix array file to read
 *   argv[2]  genome FASTA file
 *   argv[3]  suffix array file to write
 *
 * Options:
 *   -blt  N  build a lookup table with prefix length N
 *   -blcp N  request an LCP table (only prints a "not yet implemented" notice)
 *
 * Exits with status 1 on too few arguments, an unknown option, or an option
 * that is missing its value.
 */
int main(int argc, char* argv[]) {

	if (argc < 4) {
		PrintUsage();
		exit(1);
	}
	int argi = 1;
	string saInFile = argv[argi++];
	string genomeFileName = argv[argi++];
	string saOutFile = argv[argi++];

	int doBLT = 0;
	int doBLCP = 0;
	int bltPrefixLength = 0;

	while (argi < argc) {
		const bool isBLT  = (strcmp(argv[argi], "-blt") == 0);
		const bool isBLCP = (strcmp(argv[argi], "-blcp") == 0);

		if (!isBLT and !isBLCP) {
			PrintUsage();
			cout << "Bad option: " << argv[argi] << endl;
			exit(1);
		}

		// Both options take exactly one value.  Without this check a trailing
		// "-blt" or "-blcp" would hand argv[argc] (a null pointer) to atoi.
		if (argi + 1 >= argc) {
			PrintUsage();
			cout << "Missing value for option: " << argv[argi] << endl;
			exit(1);
		}

		++argi;
		if (isBLT) {
			doBLT = 1;
			bltPrefixLength = atoi(argv[argi]);
		}
		else {
			// The LCP length is consumed but not used: the LCP table is not
			// implemented below.
			doBLCP = 1;
		}
		++argi;
	}

	//
	// Read the suffix array to modify.
	//
	DNASuffixArray  sa;
	sa.Read(saInFile);

	//
	// Read the genome as a single concatenated sequence.
	//
	FASTAReader reader;
	reader.Initialize(genomeFileName);
	FASTASequence seq;
	reader.ReadAllSequencesIntoOne(seq);

	if (doBLT) {
		sa.BuildLookupTable(seq.seq, seq.length, bltPrefixLength);
	}

	if (doBLCP) {
		cout << "LCP Table not yet implemented." << endl;
	}

	sa.Write(saOutFile);

	return 0;
}

コード例 #2

0

ファイルを表示

ファイル: FindUnique.cpp プロジェクト: JinfengChen/chm1_scripts

/**
 * findUnique: for every query sequence, reports each window of effective_k
 * bases whose number of suffix-array matches in the genome (forward strand
 * plus reverse complement) does not exceed the allowed maximum.
 *
 * Positional arguments (all required):
 *   argv[1]  genome FASTA file; the suffix array is read from argv[1] + ".sa"
 *   argv[2]  query FASTA file
 *   argv[3]  effective_k, the window length (must be a positive integer)
 *
 * Options are described by the usage text below.  Unrecognized options are
 * silently skipped, as before.
 *
 * Output:
 *   <out file>             one line per accepted window: title, window, offset
 *   <query file>.summary   one line per searched query: title, accepted count
 */
int main(int argc, char* argv[]) {
	// argv[3] (effective_k) is always read, so all three positional
	// arguments must be present.
	if (argc < 4) {
		cout << "Usage: findUnique genome.fasta query.fasta effective_k [options]" << endl;
		cout << "  genome.fasta.sa must exist." << endl;
		cout << "  Finds sequences at least effective_k in length that are unique." << endl;
		cout << "  -max m       Allow up to m matches" << endl;
		cout << "  -minLength l Ensure the length of the match is at least this." << endl;
		cout << "  -prefix p n  Allow up to n matches across a prefix of length p" << endl;
		cout << "  -suffix s n  Allow up to n matches across a suffix of length s" << endl;
		cout << "               Prefix and suffix options override max." << endl;
		cout << "  -out file    Print queries to this output file (query.fasta.queries)" << endl;
		exit(0);
	}

	string genomeFileName = argv[1];
	string suffixArrayFileName = genomeFileName + ".sa";
	string queryFileName = argv[2];
	int maxLength = atoi(argv[3]);
	if (maxLength <= 0) {
		cout << "effective_k must be a positive integer, got: " << argv[3] << endl;
		exit(1);
	}

	int maxN = 0;
	int prefix = 0;
	int suffix = 0;
	int prefixN = 0;
	int suffixN = 0;
	int minLength = 0;
	string outputFileName = "";

	int argi = 4;
	while (argi < argc) {
		const char* option = argv[argi];
		const bool isMax       = (strcmp(option, "-max") == 0);
		const bool isPrefix    = (strcmp(option, "-prefix") == 0);
		const bool isSuffix    = (strcmp(option, "-suffix") == 0);
		const bool isOut       = (strcmp(option, "-out") == 0);
		const bool isMinLength = (strcmp(option, "-minLength") == 0);

		// Number of values that must follow the option on the command line.
		int numValues = 0;
		if (isPrefix or isSuffix) {
			numValues = 2;
		}
		else if (isMax or isOut or isMinLength) {
			numValues = 1;
		}
		if (argi + numValues >= argc) {
			// Without this check the parser would read argv[argc] or beyond.
			cout << "Missing value for option: " << option << endl;
			exit(1);
		}

		if (isMax) {
			maxN = atoi(argv[++argi]);
		}
		else if (isPrefix) {
			prefix  = atoi(argv[++argi]);
			prefixN = atoi(argv[++argi]);
		}
		else if (isSuffix) {
			suffix  = atoi(argv[++argi]);
			suffixN = atoi(argv[++argi]);
		}
		else if (isOut) {
			outputFileName = argv[++argi];
		}
		else if (isMinLength) {
			minLength = atoi(argv[++argi]);
		}
		++argi;
	}

	FASTAReader reader;
	FASTASequence genome;
	DNASuffixArray sarray;
	reader.Initialize(genomeFileName);
	reader.ReadAllSequencesIntoOne(genome);
	sarray.Read(suffixArrayFileName);

	string summaryTableFileName = queryFileName + ".summary";
	if (outputFileName == "") {
		outputFileName = queryFileName + ".queries";
	}

	ofstream summaryTable(summaryTableFileName.c_str());
	ofstream outputFile(outputFileName.c_str());

	FASTAReader queryReader;
	FASTASequence querySequence;
	queryReader.Initialize(queryFileName);

	while (queryReader.GetNext(querySequence)) {
		cerr << "searching " << querySequence.title << endl;
		// Queries shorter than one window are skipped entirely (and get no
		// summary line).
		if (querySequence.length < maxLength) {
			continue;
		}

		int nMatches = 0;
		querySequence.ToUpper();
		int i;
		for (i = 0; i < querySequence.length - maxLength + 1; i++) {
			if ((i + 1) % 100000 == 0) {
				cerr << "processed: " << i + 1 << endl;
			}

			// Pick the match budget for this offset: the prefix/suffix
			// budgets override -max inside their regions.
			int localMax = maxN;
			if (i < prefix) {
				localMax = prefixN;
			}
			if (i >= querySequence.length - suffix) {
				localMax = suffixN;
			}
			// NOTE(review): this also skips the final full-length window
			// (length - i == maxLength); kept as in the original.
			if (querySequence.length - i <= maxLength) {
				continue;
			}
			if (querySequence.seq[i] == 'N') {
				continue;
			}

			vector<SAIndex> lcpLeftBounds, lcpRightBounds;
			int lcpLength = sarray.StoreLCPBounds(genome.seq, genome.length, // The string which the suffix array is built on.
			                                      &querySequence.seq[i], querySequence.length-i,
			                                      true,
			                                      maxLength,
			                                      lcpLeftBounds, lcpRightBounds,
			                                      false);
			if (lcpLength < minLength) {
				continue;
			}

			const bool haveForwardBounds = (lcpRightBounds.size() > 0 and lcpLeftBounds.size() > 0);
			const bool forwardOk =
				lcpLength < maxLength or
				lcpRightBounds.size() == 0 or
				(haveForwardBounds and
				 lcpRightBounds.back() - lcpLeftBounds.back() <= localMax);
			if (!forwardOk) {
				continue;
			}

			// Width of the forward-strand suffix array interval.  Only read the
			// bounds when they exist; a partial match can leave them empty.
			int numForwardMatches = 0;
			if (lcpLength != 0 and haveForwardBounds) {
				numForwardMatches = lcpRightBounds.back() - lcpLeftBounds.back();
			}

			// Repeat the search with the reverse complement of the window.
			FASTASequence rc;
			DNASequence subseq;
			subseq.ReferenceSubstring(querySequence, i, maxLength);
			subseq.MakeRC(rc);

			vector<SAIndex> rclcpLeftBounds, rclcpRightBounds;
			// The match-length limit is maxLength, as in the forward search.
			// The original passed the still-uninitialized rclcpLength here.
			int rclcpLength = sarray.StoreLCPBounds(genome.seq, genome.length, // The string which the suffix array is built on.
			                                        rc.seq, maxLength,
			                                        true,
			                                        maxLength,
			                                        rclcpLeftBounds, rclcpRightBounds,
			                                        false);

			const bool reverseOk =
				rclcpLength < maxLength or
				rclcpRightBounds.size() == 0 or
				(rclcpLeftBounds.size() > 0 and
				 numForwardMatches +
				 rclcpRightBounds.back() -
				 rclcpLeftBounds.back() <= localMax);
			if (reverseOk) {
				string window(reinterpret_cast<const char*>(&querySequence.seq[i]), maxLength);
				outputFile << querySequence.title << "\t" << window << "\t" << i << endl;
				++nMatches;
			}
			rc.Free();
		}
		summaryTable << querySequence.title << "\t" << nMatches << endl;
		querySequence.Free();
	}
	summaryTable.close();
	outputFile.close();
	genome.Free();
	return 0;
}

コード例 #3

0

ファイルを表示

ファイル: NormalizeGCContent.cpp プロジェクト: EichlerLab/blasr

int main(int argc, char* argv[]) {


	string refFileName, notNormalFileName, normalFileName;

	if (argc < 4) {
		cout << "usage: normalizeGCContent ref source dest " << endl
				 << "       flips the C/Gs in source randomly until they are the same gc content as ref." << endl;
		exit(1);
	}
		
	refFileName = argv[1];
	notNormalFileName = argv[2];
	normalFileName = argv[3];


	FASTAReader reader;
	FASTAReader queryReader;
	FASTASequence ref;
	vector<FASTASequence> querySequences;
	int queryTotalLength;
	reader.Initialize(refFileName);
	reader.ReadAllSequencesIntoOne(ref);

	queryReader.Initialize(notNormalFileName);
	int refCounts[5], queryCounts[5];
	int s;
	refCounts[0] = refCounts[1] =refCounts[2] = refCounts[3] = refCounts[4] = 0;
	queryCounts[0] = queryCounts[1] =queryCounts[2] = queryCounts[3] = queryCounts[4] = 0;
	
	queryReader.ReadAllSequences(querySequences);
	ofstream normOut;
	CrucialOpen(normalFileName, normOut);

	CountNucs(ref, refCounts);
	
	float refGC = (1.0*refCounts[TwoBit['c']] + refCounts[TwoBit['g']]) / (refCounts[TwoBit['a']] + refCounts[TwoBit['c']] + refCounts[TwoBit['g']] + refCounts[TwoBit['t']]);

	int q;
	for (q = 0; q < querySequences.size(); q++) {
		CountNucs(querySequences[q], queryCounts);
	}

	float queryGC = (1.0*queryCounts[TwoBit['c']] + queryCounts[TwoBit['g']]) / (queryCounts[TwoBit['a']] + queryCounts[TwoBit['c']] + queryCounts[TwoBit['g']] + queryCounts[TwoBit['t']]);

	
	float gcToat = 0.0;
	float atTogc = 0.0;
	if (refGC > queryGC) {
		atTogc = (refGC - queryGC);
	}
	else {
		gcToat = (queryGC - refGC);
	}

	
	DNALength queryGenomeLength = queryCounts[0] +  queryCounts[1] + queryCounts[2] + queryCounts[3] + queryCounts[4];

	DNALength unmaskedQueryLength = queryCounts[0] +  queryCounts[1] + queryCounts[2] + queryCounts[3];

	DNALength ngc2at = unmaskedQueryLength * gcToat;
	DNALength nat2gc = unmaskedQueryLength * atTogc;
	cout << refGC << " " << queryGC << " " << gcToat << " " << atTogc << " " << ngc2at << " " << nat2gc << endl;

	vector<FASTASequence> normalized;

	normalized.resize(querySequences.size());
	vector<DNALength> cumLengths;
	
	cumLengths.resize(normalized.size()+1);
	cumLengths[0] = 0;
	for (q = 0; q < querySequences.size(); q++) {
		normalized[q]   = querySequences[q];
		cumLengths[q+1] = cumLengths[q] + querySequences[q].length;
	}
	
	DNALength i;

																
	for (i = 0; i < ngc2at; i+=2) {
		DNALength pos, chr;
		FindRandomNuc(normalized, queryGenomeLength, cumLengths, 'G', chr, pos);
		normalized[chr].seq[pos] = 'A';
		FindRandomNuc(normalized, queryGenomeLength, cumLengths, 'C', chr, pos);
		normalized[chr].seq[pos] = 'T';		
	}
	
	for (i = 0; i < nat2gc; i+=2) {
		DNALength pos, chr;
		FindRandomNuc(normalized, queryGenomeLength, cumLengths, 'A', chr, pos);
		normalized[chr].seq[pos] = 'g';
		FindRandomNuc(normalized, queryGenomeLength, cumLengths, 'T', chr, pos);
		normalized[chr].seq[pos] = 'c';		
	}

	for (q = 0; q < normalized.size(); q++ ){
		normalized[q].PrintSeq(normOut);
	}

}

コード例 #4

0

ファイルを表示

ファイル: SAWriter.cpp プロジェクト: bnbowman/blasr

/**
 * Suffix array writer: reads one or more sequence files, builds a suffix
 * array over the concatenated sequence with the selected algorithm, builds a
 * lookup table on it, and writes the result.
 *
 * Arguments:
 *   argv[1]           output suffix array file.  If no input files follow,
 *                     argv[1] is taken as the input FASTA file and the output
 *                     is written to argv[1] + ".sa".
 *   argv[2..]         input files, up to the first argument starting with '-';
 *                     everything from that argument on is parsed as options.
 *
 * Options handled here: -blt N, -mamy, -larsson, -mcilroy, -slow, -kark,
 * -mafe, -welter, -welterweight N, -4bit.
 *
 * Exits with status 1 on bad or incomplete options, and when the sequence
 * length reaches UINT_MAX.
 */
int main(int argc, char* argv[]) {

	if (argc < 2) {
		PrintUsage();
		exit(1);
	}
	int argi = 1;
	string saFile = argv[argi++];
	vector<string> inFiles;

	int doBLT = 1;
	int bltPrefixLength = 8;
	int parsingOptions = 0;
	SAType saBuildType = larsson;
	int read4BitCompressed  = 0;
	int diffCoverSize = 0;
	while (argi < argc) {
		// The first argument that starts with '-' switches the parser into
		// option mode for the remainder of the command line.
		if (strlen(argv[argi]) > 0 and
				argv[argi][0] == '-'){
			parsingOptions = 1;
		}
		if (!parsingOptions) {
			inFiles.push_back(argv[argi]);
		}
		else {
			if (strcmp(argv[argi], "-blt") == 0) {
				doBLT = 1;
				if (argi < argc - 1) {
					bltPrefixLength = atoi(argv[++argi]);
					// atoi yields 0 for non-numeric text; negative lengths are
					// just as invalid, so reject both.
					if (bltPrefixLength <= 0) {
						cout << argv[argi] << " is not a valid lookup table length." << endl;
						exit(1);
					}
				}
				else {
					cout << "Please specify a lookup table length." << endl;
					exit(1);
				}
			}
			else if (strcmp(argv[argi], "-mamy") == 0) {
				saBuildType = manmy;
			}
			else if (strcmp(argv[argi], "-larsson") == 0) {
				saBuildType = larsson;
			}
			else if (strcmp(argv[argi], "-mcilroy") == 0) {
				saBuildType = mcilroy;
			}
			else if (strcmp(argv[argi], "-slow") == 0) {
				saBuildType = slow;
			}
			else if (strcmp(argv[argi], "-kark") == 0) {
				saBuildType = kark;
			}
			else if (strcmp(argv[argi], "-mafe") == 0) {
				saBuildType = mafe;
			}
			else if (strcmp(argv[argi], "-welter") == 0) {
				saBuildType = welter;
			}
			else if (strcmp(argv[argi], "-welterweight") == 0) {
				if (argi < argc-1) {
					diffCoverSize = atoi(argv[++argi]);
				}
				else {
					cout << "Please specify a difference cover size.  Valid values are 7,32,64,111, and 2281.  Larger values use less memory but may be slower." << endl;
					exit(1);
				}
				if ( ! (diffCoverSize == 7 or
								diffCoverSize == 32 or
								diffCoverSize == 64 or
								diffCoverSize == 111 or
								diffCoverSize == 2281) ) {
					cout << "The difference cover size must be one of 7,32,64,111, or 2281." << endl;
					cout << "Larger numbers use less space but are more slow." << endl;
					exit(1);
				}
			}
			else if (strcmp(argv[argi], "-4bit") == 0) {
				read4BitCompressed = 1;
			}
			else {
				PrintUsage();
				cout << "ERROR, bad option: " << argv[argi] << endl;
				exit(1);
			}
		}
		++argi;
	}

	if (inFiles.size() == 0) {
		//
		// Special use case: the input file is a fasta file.  Write to that file + .sa
		//
		inFiles.push_back(saFile);
		saFile = saFile + ".sa";
	}

	VectorIndex inFileIndex;
	FASTASequence seq;
	CompressedSequence<FASTASequence> compSeq;

	if (read4BitCompressed == 0) {
		for (inFileIndex = 0; inFileIndex < inFiles.size(); ++inFileIndex) {
			FASTAReader reader;
			reader.Init(inFiles[inFileIndex]);
			//
			// Space padding is requested for every build type.  The original
			// comment notes that the Karkkainen method needs a little extra
			// space at the end of the sequence so that counting may be done
			// mod 3 without extra boundary logic.
			//
			reader.SetSpacePadding(111);

			if (inFileIndex == 0) {
				reader.ReadAllSequencesIntoOne(seq);
				reader.Close();
			}
			else {
				// NOTE(review): unlike the first file, this reader is never
				// explicitly closed -- confirm whether that is intended.
				while(reader.ConcatenateNext(seq)) {
					cout << "added " << seq.title << endl;
				}
			}
		}
		seq.ToThreeBit();
		//seq.ToUpper();
	}
	else {
		// A compressed input is read as a single file.  Report this as a
		// normal error rather than an assert, which disappears under NDEBUG.
		if (inFiles.size() != 1) {
			cout << "ERROR, exactly one input file may be given with -4bit." << endl;
			exit(1);
		}
		cout << "reading compressed sequence." << endl;
		compSeq.Read(inFiles[0]);
		seq.seq = compSeq.seq;
		seq.length = compSeq.length;
		compSeq.RemoveCompressionCounts();
		cout << "done." << endl;
	}

	//
	// For now, do not allow creation of suffix arrays on sequences > 4G.
	//
	if (seq.length >= UINT_MAX) {
		cout << "ERROR, references greater than " << UINT_MAX << " bases are not supported." << endl;
		cout << "Consider breaking the reference into multiple files, running alignment. " << endl;
		cout << "against each file, and merging the result." << endl;
		exit(1);
	}
	vector<int> alphabet;

	SuffixArray<Nucleotide, vector<int> >  sa;
	//	sa.InitTwoBitDNAAlphabet(alphabet);
	//	sa.InitAsciiCharDNAAlphabet(alphabet);
	sa.InitThreeBitDNAAlphabet(alphabet);

	if (saBuildType == manmy) {
		sa.MMBuildSuffixArray(seq.seq, seq.length, alphabet);
	}
	else if (saBuildType == mcilroy) {
		// Copy the sequence shifted up by one with a trailing 0, sort it with
		// ssort, then shift the result down by one slot to drop the first entry.
		sa.index = new SAIndex[seq.length+1];
		DNALength i;
		for (i = 0; i < seq.length; i++) { sa.index[i] = seq.seq[i] + 1;}
		sa.index[seq.length] = 0;
		ssort(sa.index, NULL);
		for (i = 1; i < seq.length+1; i++ ){ sa.index[i-1] = sa.index[i];}
		sa.length = seq.length;
	}
	else if (saBuildType == larsson) {
		sa.LarssonBuildSuffixArray(seq.seq, seq.length, alphabet);
	}
	else if (saBuildType == kark) {
		sa.index = new SAIndex[seq.length];
		seq.ToThreeBit();
		// Every symbol is incremented before the build.
		DNALength p;
		for (p = 0; p < seq.length; p++ ){ seq.seq[p]++; }
		KarkkainenBuildSuffixArray<Nucleotide>(seq.seq, sa.index, seq.length, 5);
		sa.length = seq.length;
	}
	else if (saBuildType == mafe) {
		// NOTE(review): the build call is commented out, so -mafe leaves the
		// suffix array unbuilt before the lookup table and write steps below.
		//		sa.MaFeBuildSuffixArray(seq.seq, seq.length);
	}
	else if (saBuildType == welter) {
		if (diffCoverSize == 0) {
			sa.LightweightBuildSuffixArray(seq.seq, seq.length);
		}
		else {
			sa.LightweightBuildSuffixArray(seq.seq, seq.length, diffCoverSize);
		}
	}
	if (doBLT) {
		sa.BuildLookupTable(seq.seq, seq.length, bltPrefixLength);
	}
	sa.Write(saFile);

	return 0;

}

コード例 #5

0

ファイルを表示

ファイル: PrintWordCount.cpp プロジェクト: EichlerLab/blasr

int main(int argc, char* argv[]) {
	string genomeFileName;
	string suffixArrayFileName;
	if (argc < 4) {
		cout << "Usage: printWordCount genome suffixArray k [k2 k3 k4...]" << endl;
		exit(1);
	}
	genomeFileName = argv[1];
	suffixArrayFileName = argv[2];
	int argi = 3;
	vector<DNALength> k;
	while (argi < argc) {
		k.push_back(atoi(argv[argi]));
		argi++;
	}

	// Get the ref sequence.
	FASTAReader reader;
	reader.Init(genomeFileName);
	FASTASequence seq;
  //	reader.GetNext(seq);
  reader.ReadAllSequencesIntoOne(seq);
	seq.ToUpper();
	// Get the suffix array.
	DNASuffixArray sarray;
	sarray.Read(suffixArrayFileName);
	
	int ki;
  char *word;
  cout << "wordlen word nword" << endl;
	for (ki = 0; ki < k.size(); ki++) {
    word = new char[k[ki]+1];
    word[k[ki]] = '\0';
		DNALength i;
		DNALength numUnique = 0;
		for (i = 0; i < seq.length - k[ki] - 1; ) {
			DNALength j = i + 1;
      bool seqAtN = false;
      int si;
      for(si = 0; si < k[ki]; si++) {
        if (seq.seq[sarray.index[i] + si] == 'N') {
          seqAtN = true;
          break;
        }
      }
      if (seqAtN) {
        i++;
        continue;
      }
			while (j < seq.length - k[ki] and 
						 seq.length - sarray.index[i] >= k[ki] and
						 seq.length - sarray.index[j] >= k[ki] and 
						 strncmp((const char*) &seq.seq[sarray.index[i]], (const char*) &seq.seq[sarray.index[j]], k[ki]) == 0) {
				j++;
			}
      if (seq.length - sarray.index[i] >= k[ki]) {
        for(si = 0; si < k[ki]; si++) {
          word[si] = seq.seq[sarray.index[i]+si];
        }
        cout << k[ki] << " " << word << " " << j - i + 1 << endl;
        if (j == i + 1) { 
          ++numUnique;
        }
      }
			i = j;
		}
	}
}