示例#1
0
/**
 * Read one named record from a (multi-)FASTA file.
 *
 * The file is scanned line by line for a header that is exactly ">" + name
 * (no trailing description or whitespace is tolerated). Once the header has
 * been found, the still-open stream is handed to read_sequence(), which is
 * presumably responsible for consuming the record's sequence lines up to the
 * next header -- confirm against read_sequence().
 *
 * @param filename  path of the FASTA file to read
 * @param name      record name, without the leading '>'
 * @return          whatever read_sequence() produces for the matching record
 *
 * @throws std::domain_error      if the file cannot be opened
 * @throws std::invalid_argument  if no header line equals ">" + name
 */
SeqType
read_fasta(const std::string& filename, const std::string& name)
{
  const std::string begin_contig = ">" + name;

  // open file
  std::ifstream fastafile(filename.c_str(), std::ifstream::in);

  if (!fastafile.is_open()) {
    std::stringstream ss;
    ss << "Fasta file " << filename << " cannot be opened.";
    throw std::domain_error(ss.str());
  }

  // Find the contig in the multifasta. Looping on the result of getline
  // (rather than testing eof() afterwards) terminates on *any* stream
  // failure, not just end-of-file, and still accepts a header that sits on
  // an unterminated last line.
  std::string line;
  bool found = false;
  while (std::getline(fastafile, line)) {
    if (line == begin_contig) {
      found = true;
      break;
    }
  }

  if (!found) {
    std::stringstream s;
    s << "Sequence of \"" << name << "\" not found.";
    throw std::invalid_argument(s.str());
  }

  // The stream is now positioned on the line following the header.
  SeqType sequence = read_sequence(fastafile);

  // fastafile is closed by its destructor.
  return sequence;
}
示例#2
0
int main(int argc, char** argv)
{
	if (argc < 4)
	{
		std::cout << "Usage: get_TrieArray <fastafile> <matrix> <peptideLength> <outfile> " << std::endl;
		return -1;
	}

	string fastafile(argv[1]);
	string matrix(argv[2]);
	cout << "test" << endl;
	int peptideLength(atoi(argv[3]));
	string outname(argv[4]);

	
	//std::cout << fastafile << "\t" << outname << std::endl;
		
//----------------------------------------------------------------------------------------
	cout << "Reading FASTA file..." << endl;

	Sequences s(fastafile);
	cout << "Read " << s.size() << " sequences." << endl;

	cout << "Generating peptides..." << endl;
	Sequences ninemers;
	generateAllSubstrings(ninemers, s, peptideLength);
	cout << "Generated " << ninemers.size() << " peptides." << endl;

	s.clear();


  	//Matrix m("/abi-projects/dist2self/matrices/BLOSUM45_distance_normal.dat"); cout << "Initializing trie. " << endl; Trie t; 
  	Matrix m(matrix); cout << "Initializing trie. " << endl; Trie t;
	Matrix::IndexSequence indices; 
  	for (size_t i = 0; i < ninemers.size(); ++i) {
 
  	 m.translate(ninemers[i], indices); t.add(indices);
  	} 
    t.dump();


	cout << "Converting to trie array." << endl;
  	TrieArray ta(t, peptideLength);

	cout << "Done." << endl;

//	std::ofstream ofs("test.trie");
	std::ofstream ofs(outname.c_str());
	boost::archive::text_oarchive oa(ofs);
	ta.save(oa,1);


}
示例#3
0
/**
 * Command-line entry point: write every 9-residue substring of the sequences
 * in a FASTA file to an output file, one FASTA-style record per substring
 * ("> <index>" followed by the substring).
 *
 * Arguments: <fastafile> <outfile>
 *
 * NOTE(review): the usage text names "get_TrieArray", which looks copied from
 * a sibling tool -- confirm the intended program name before changing it.
 *
 * @return 0 on success, -1 on a usage error or an output-file failure.
 */
int main(int argc, char** argv)
{
	// Two positional arguments are read (argv[1] and argv[2]), so argc must
	// be at least 3; argv[argc] is a null pointer and must never be used.
	if (argc < 3)
	{
		std::cout << "Usage: get_TrieArray <fastafile> <outfile> " << std::endl;
		return -1;
	}

	std::string fastafile(argv[1]);
	const char* outname(argv[2]);

	std::ofstream myfile;
	myfile.open(outname);
	if (!myfile.is_open())
	{
		std::cerr << "Cannot open output file " << outname << std::endl;
		return -1;
	}

	std::cout << fastafile << "\t" << outname << std::endl;

//----------------------------------------------------------------------------------------
	std::cout << "Reading FASTA file..." << std::endl;

	Sequences s(fastafile);
	std::cout << "Read " << s.size() << " sequences." << std::endl;

	std::cout << "Generating ninemers..." << std::endl;
	Sequences ninemers;
	generateAllSubstrings(ninemers, s, 9);
	std::cout << "Generated " << ninemers.size() << " ninemers." << std::endl;

	// The input sequences are no longer needed once the ninemers exist.
	s.clear();

	// '\n' instead of std::endl: same bytes in the file, but without forcing
	// a flush after every single line.
	for (size_t i = 0; i < ninemers.size(); ++i)
	{
		myfile << "> " << i << '\n';
		myfile << ninemers[i] << '\n';
	}

	myfile.close();
	if (myfile.fail())
	{
		std::cerr << "Error while writing output file " << outname << std::endl;
		return -1;
	}

	std::cout << "Done." << std::endl;

	return 0;
}
示例#4
0
文件: fasta.cpp 项目: cmbi/kmad
/// Parse a FASTA-style file, optionally followed by a probability section.
///
/// Layout handled by this parser:
///   - a line starting with '>' is a header and begins a new record;
///   - the following lines are concatenated into that record's residue
///     string, which is passed together with the header and codon_length to
///     fasta::make_sequence();
///   - the first line starting with '#' ends the sequence section; any
///     further '#' lines are skipped;
///   - every other line after that must consist of exactly two fields
///     separated by a single tab or space, "<key> <number>", and is stored
///     as probabilities["m_" + key].
/// Carriage returns are stripped from every line, so CRLF files are accepted.
///
/// A header that is immediately followed by another header (no residues) is
/// dropped; a trailing header without residues at end of file is kept.
///
/// @param filename      path of the file to parse
/// @param codon_length  forwarded unchanged to fasta::make_sequence()
/// @return              the parsed sequences and probabilities
///
/// @throws std::invalid_argument  if the file does not exist (std::stod may
///                                also throw it for a non-numeric value)
/// @throws std::runtime_error     if the file cannot be opened, if the
///                                probability section starts before any
///                                residues were read, or if a probability
///                                line does not have exactly two fields
fasta::FastaData fasta::parse_fasta(std::string const& filename,
                                    int codon_length)
{
  fs::path p(filename);
  if (!fs::exists(p)) {
    throw std::invalid_argument("File not found: " + filename);
  }

  std::ifstream fastafile(filename.c_str());
  // The path can exist and still be unreadable (permissions, ...).
  if (!fastafile.is_open()) {
    throw std::runtime_error("Cannot open file: " + filename);
  }

  std::string line;
  std::string header;
  std::string seq_line;
  bool in_sequence_section = true;
  fasta::FastaData fd;

  // Store the record accumulated so far and reset the residue buffer, so the
  // same record can never be stored a second time.
  auto flush_sequence = [&]() {
    fd.sequences.push_back(
        fasta::make_sequence(header, seq_line, codon_length));
    seq_line.clear();
  };

  while (std::getline(fastafile, line)) {
    line.erase(std::remove(line.begin(), line.end(), '\r'), line.end());
    const char first = line.empty() ? '\0' : line[0];

    if (first == '>') {
      if (!seq_line.empty()) {
        flush_sequence();
      }
      in_sequence_section = true;
      header = line;
      continue;
    }

    if (first == '#') {
      if (in_sequence_section) {
        // Malformed input must be reported in release builds too, so this is
        // an exception rather than an assert.
        if (seq_line.empty()) {
          throw std::runtime_error(
              "Probability section starts before any sequence data: " + line);
        }
        flush_sequence();
        in_sequence_section = false;
      }
      continue;
    }

    if (in_sequence_section) {
      seq_line += line;
    } else {
      std::vector<std::string> result;
      boost::split(result, line, boost::is_any_of("\t "));

      if (result.size() != 2) {
        throw std::runtime_error("Invalid probability format: " + line);
      }
      fd.probabilities["m_" + result[0]] = std::stod(result[1]);
    }
  }

  // If the file doesn't have a probability section, add the last sequence
  // that was being processed -- unless nothing at all was read, in which case
  // there is no record to add.
  if (in_sequence_section && !(header.empty() && seq_line.empty())) {
    flush_sequence();
  }

  return fd;
}