// Opens `filename` with a SingleFastaReader and, for every record that
// Read() returns, appends an IgGene built from the record's name and
// sequence to ig_genes_.
void AddGenesFromFile(string filename) {
    SingleFastaReader fasta_reader(filename);
    auto reads = fasta_reader.Read();

    for (auto& record : reads) {
        ig_genes_.push_back(IgGene(record.name, record.seq));
    }
}
// Fills `kcounter` from a FASTA file of pre-counted k-mers.
//
// Each record's sequence is the k-mer and its header is parsed with atoi()
// to obtain the count. Records are pulled from one shared Fasta_reader by
// all threads of an OpenMP parallel region; a thread stops as soon as it
// receives an entry with an empty sequence. Sequences whose length differs
// from KMER_SIZE are reported on stderr and not added.
//
// kcounter          counter that receives add_kmer(kmer, count) calls.
// kmers_fasta_file  path handed to Fasta_reader.
void populate_kmer_counter(KmerCounter &kcounter, string &kmers_fasta_file)
{
    // code largely copied from IRKE.cpp
    const int max_threads = omp_get_max_threads();

    // One slot per potential thread; the trailing "()" zero-initialises every slot.
    unsigned long *records_per_thread = new unsigned long[max_threads]();

    cerr << "-reading Kmer occurences..." << endl;
    const unsigned long t_begin = time(NULL);

    Fasta_reader fasta_reader(kmers_fasta_file);

#pragma omp parallel
    {
        const int tid = omp_get_thread_num();
        records_per_thread[tid] = 0;

        for (;;) {
            Fasta_entry fe = fasta_reader.getNext();
            if (fe.get_sequence() == "") {
                break;
            }
            ++records_per_thread[tid];

            // Progress line: only thread 0 reports, every 100000 of its own records.
            if (IRKE_COMMON::MONITOR && tid == 0 && records_per_thread[tid] % 100000 == 0) {
                unsigned long parsed_so_far = 0;
                for (int t = 0; t < omp_get_num_threads(); t++) {
                    parsed_so_far += records_per_thread[t];
                }
                cerr << "\r [" << parsed_so_far / 1000000 << "M] Kmers parsed.     ";
            }

            string seq = fe.get_sequence();
            if (seq.length() == KMER_SIZE) {
                kmer_int_type_t kmer = kcounter.get_kmer_intval(seq);
                unsigned int count = atoi(fe.get_header().c_str());
                kcounter.add_kmer(kmer, count);
            }
            else {
                cerr << "ERROR: kmer " << seq << " is not of length: " << KMER_SIZE << endl;
            }
        }
    }
    const unsigned long t_end = time(NULL);

    // Fold the per-thread tallies into the grand total for the summary line.
    unsigned long total_records = 0;
    for (int t = 0; t < max_threads; t++) {
        total_records += records_per_thread[t];
    }
    delete[] records_per_thread;

    cerr << endl << " done parsing " << total_records << " Kmers, " << kcounter.size() << " added, taking " << (t_end - t_begin)
        << " seconds." << endl;
}
Exemplo n.º 3
0
int ReadFastaQueries(const string& filename,
                      vector< CRef<objects::CSeq_loc> >& seqs,
                      CRef<objects::CScope>& scope,
                      bool parse_deflines /* = false*/,
                      objects::CSeqIdGenerator* id_generator /* = NULL*/)
{
    seqs.clear();
    CNcbiIfstream instream(filename.c_str());
    if (!instream) {
        return -1;
    }

    CStreamLineReader line_reader(instream);
    CFastaReader::TFlags flags =  CFastaReader::fAssumeProt |
        CFastaReader::fForceType;
    
    if (!parse_deflines) {
        flags |= CFastaReader::fNoParseID;
    }

    CFastaReader fasta_reader(line_reader, flags);

    if (id_generator) {
        fasta_reader.SetIDGenerator(*id_generator);
    }

    scope->AddDefaults();
    while (!line_reader.AtEOF()) {

        CRef<CSeq_entry> entry = fasta_reader.ReadOneSeq();

        if (entry == 0) {
            return -1;
        }
        scope->AddTopLevelSeqEntry(*entry);
        CTypeConstIterator<CBioseq> itr(ConstBegin(*entry));
        CRef<CSeq_loc> seqloc(new CSeq_loc());
        seqloc->SetWhole().Assign(*itr->GetId().front());
        seqs.push_back(seqloc);
    }

    return 0;
}
Exemplo n.º 4
0
int ReadMsa(const string& filename, CRef<CSeq_align>& align,
            CRef<CScope> scope, bool parse_deflines /* = false*/,
            objects::CSeqIdGenerator* id_generator /* = NULL*/)
{
    if (scope.Empty()) {
        return -1;
    }

    CNcbiIfstream instream(filename.c_str());
    if (!instream) {
        return -1;
    }
    CStreamLineReader line_reader(instream);

    CFastaReader::TFlags flags =  CFastaReader::fAssumeProt |
        CFastaReader::fForceType | CFastaReader::fValidate;
    
    if (!parse_deflines) {
        flags |= CFastaReader::fNoParseID;
    }

    CFastaReader fasta_reader(line_reader, flags);

    if (id_generator) {
        fasta_reader.SetIDGenerator(*id_generator);
    }

    CRef<CSeq_entry> entry = fasta_reader.ReadAlignedSet(-1);
    if (entry.Empty()) {
        return -1;
    }
    scope->AddTopLevelSeqEntry(*entry);

    // notify of a problem if the whole file was not read
    if (!line_reader.AtEOF()) {
        return -1;
    }

    align = entry->GetAnnot().front()->GetData().GetAlign().front();

    return 0;
}
Exemplo n.º 5
0
void IRKE::populate_Kmers_from_kmers(const string& fasta_filename) {
	unsigned int kmer_length = kcounter.get_kmer_length();
	int i, myTid;
	unsigned long sum, 
                  *record_counter = new unsigned long[omp_get_max_threads()];
	unsigned long start, end;

  // init record counter
  for (int i = 0; i < omp_get_max_threads(); i++) {
  	record_counter[i] = 0;
  }

	cerr << "-reading Kmer occurences..." << endl;
	start = time(NULL);

	Fasta_reader fasta_reader(fasta_filename);

  #pragma omp parallel private (myTid)
	{
		myTid = omp_get_thread_num();
		record_counter[myTid] = 0;

		while (true) {
			Fasta_entry fe = fasta_reader.getNext();
			if (fe.get_sequence() == "") break;

            record_counter[myTid]++;

			if (IRKE_COMMON::MONITOR) {
				if (myTid == 0 && record_counter[myTid] % 100000 == 0)
					{
						sum = record_counter[0];
						for (i=1; i<omp_get_num_threads(); i++)
							sum+= record_counter[i];
						cerr << "\r [" << sum/1000000 << "M] Kmers parsed.     ";
                    }
			}


			string seq = fe.get_sequence();
			if (seq.length() != kmer_length) {
				continue;
			}

			kmer_int_type_t kmer = kcounter.get_kmer_intval(seq);
			unsigned int count = atoi(fe.get_header().c_str());
			kcounter.add_kmer(kmer, count);
		}
	}
	end = time(NULL);

	sum = record_counter[0];
	for (i=1; i<omp_get_max_threads(); i++)
		sum+= record_counter[i];
    delete [] record_counter;

	cerr << endl << " done parsing " << sum << " Kmers, " << kcounter.size() << " added, taking " << (end-start) << " seconds." << endl;

    ofstream iworm_kmer_count_report_fh;
    iworm_kmer_count_report_fh.open("inchworm.kmer_count");
    iworm_kmer_count_report_fh << kcounter.size() << endl;
    iworm_kmer_count_report_fh.close();
    

	return;
}
Exemplo n.º 6
0
void IRKE::populate_Kmers_from_fasta(const string& fasta_filename, bool reassembleIworm) {
	
	unsigned int kmer_length = kcounter.get_kmer_length();
	int i, myTid;
	unsigned long sum, 
                  *record_counter = new unsigned long[omp_get_max_threads()];
	unsigned long start, end;
    
    // init record counter
    for (int i = 0; i < omp_get_max_threads(); i++) {
        record_counter[i] = 0;
    }

	
	cerr << "-storing Kmers..." << endl;
	start = time(NULL);
	
	Fasta_reader fasta_reader(fasta_filename);

    unsigned int entry_num = 0;

    #pragma omp parallel private (myTid)
	{
		myTid = omp_get_thread_num();
		record_counter[myTid] = 0;
		
		while (fasta_reader.hasNext()) {
			Fasta_entry fe = fasta_reader.getNext();
            string accession = fe.get_accession();

            #pragma omp atomic            
            entry_num++;
            record_counter[myTid]++;
			
			if (IRKE_COMMON::MONITOR >= 4) {
				cerr << "[" << entry_num << "] acc: " << accession << ", by thread no: " << myTid << endl;;
			}
			else if (IRKE_COMMON::MONITOR) {
				if (myTid == 0 && record_counter[myTid] % 1000 == 0)
					{
						sum = record_counter[0];
						for (i=1; i<omp_get_num_threads(); i++)
							sum+= record_counter[i];
						cerr << "\r [" << sum << "] sequences parsed.     ";
					}
			}
			
			
			string seq = fe.get_sequence();
			
			if (seq.length() < kmer_length + 1) {
				continue;
			}
			
			if (reassembleIworm) {
				string accession = fe.get_accession();
				string header = fe.get_header();
				// get coverage value from iworm assembly
				vector<string> tokens;
				string_util::tokenize(accession, tokens, ";");
				if (tokens.size() < 2) {
					stringstream err;
					err << "Could not extract coverage value from accession: " << tokens[tokens.size()-1];
					throw(err.str());
				}
				string cov_s = tokens[tokens.size()-1];
				unsigned int cov_val = atoi(cov_s.c_str());
				
				// get Kmer value from header
				vector<string> header_toks;
				string_util::tokenize(header, header_toks, " ");
				if (header_toks.size() < 5) {
					stringstream err;
					err << "Fasta header: " << header << " lacks expected format including Kmer length from previous inchworm assembly run";
					throw(err.str());
				}
				
				unsigned int kmer_val = atoi(header_toks[2].c_str());
				
				unsigned int normalized_coverage_val = static_cast<unsigned int> (cov_val * kmer_val / 25.0 + 0.5);
				
				if (IRKE_COMMON::MONITOR >= 1) {
					cerr << "Adding inchworm assembly " << accession 
						 << " K: " << kmer_val << " Cov: " << cov_val 
						 << " with coverage: " << normalized_coverage_val << endl;
				}
				if (cov_val < 1) {
					stringstream err;
					err << "error parsing coverage value from accession: " << accession;
					throw(err.str());
				}
				kcounter.add_sequence(seq, normalized_coverage_val);
			}
			else {
				kcounter.add_sequence(seq);
			}
			
			// remove singleton kmers at read interval to minimize memory requirements.
			if (PRUNE_SINGLETON_READ_INTERVAL > 0 
				&& 
				myTid == 0
				&&
				record_counter[myTid]/omp_get_num_threads() % PRUNE_SINGLETON_READ_INTERVAL == 0) {
				if (IRKE_COMMON::MONITOR >= 1) {
					cerr << "Reached singleton kmer pruning interval at read count: " << record_counter << endl;
				}
				prune_kmers_min_count(1);
			}
			
			
		}
	}
	end = time(NULL);
	
	sum = record_counter[0];
	for (i=1; i<omp_get_max_threads(); i++)
		sum+= record_counter[i];
    delete [] record_counter;
	
	cerr << endl << " done parsing " << sum << " sequences, extracted " << kcounter.size() << " kmers, taking " << (end-start) << " seconds." << endl;
	
	
	return;
}
Exemplo n.º 7
0
// Command-line driver: builds a KmerCounter and prints one line of k-mer
// coverage statistics to stdout for every read in the --reads FASTA file.
//
// Options handled here:
//   --help                   print usage and exit(1)
//   --reads <file>           required; reads to report on
//   --kmers <file>           k-mer source (pre-counted k-mers), or
//   --kmers_from_reads <f>   k-mer source (count k-mers from reads); one of the two is required
//   --SS                     when given, the counter is built with is_DS == false
//   --kmer_size <n>          sets KMER_SIZE; values below 20 exit(2)
//   --monitor <n>            sets IRKE_COMMON::MONITOR
//   --num_threads <n>        OpenMP thread count, capped at MAX_THREADS
//   --capture_coverage_info  append the per-k-mer coverage list to each line
//
// Output columns are tab-separated: median, mean, stdev, stdev as a percent
// of the mean, accession, "thread:<id>", and optionally the comma-separated
// coverage values. Returns 0 after all reads are processed.
int runMe(int argc, char* argv[]) {

    ArgProcessor args(argc, argv);

    // Show usage for --help, for a missing --reads, or when neither k-mer
    // source option was supplied.
    if (args.isArgSet("--help")
        || !args.isArgSet("--reads")
        || (!args.isArgSet("--kmers") && !args.isArgSet("--kmers_from_reads"))) {
        cerr << usage(args) << endl << endl;
        exit(1);
    }

    string reads_fasta_file = args.getStringVal("--reads");

    const bool is_DS = !args.isArgSet("--SS");

    if (args.isArgSet("--kmer_size")) {
        KMER_SIZE = args.getIntVal("--kmer_size");
        if (KMER_SIZE < 20) {
            cerr << "Error, min kmer size is 20";
            exit(2);
        }
    }

    if (args.isArgSet("--monitor")) {
        IRKE_COMMON::MONITOR = args.getIntVal("--monitor");
    }

    if (args.isArgSet("--num_threads")) {
        // Never ask OpenMP for more than MAX_THREADS.
        const int requested_threads = args.getIntVal("--num_threads");
        omp_set_num_threads(requested_threads < MAX_THREADS ? requested_threads : MAX_THREADS);
    }

    // The cap also applies when --num_threads was not given.
    if (omp_get_max_threads() > MAX_THREADS) {
        omp_set_num_threads(MAX_THREADS);
    }

    KmerCounter kcounter (KMER_SIZE, is_DS);

    if (args.isArgSet("--kmers")) {
        string kmers_fasta_file = args.getStringVal("--kmers");
        populate_kmer_counter_from_kmers(kcounter, kmers_fasta_file);
    }
    else {
        string kmer_read_source_fasta_file = args.getStringVal("--kmers_from_reads");
        populate_kmer_counter_from_reads(kcounter, kmer_read_source_fasta_file);
    }

    Fasta_reader fasta_reader(reads_fasta_file);
    const bool write_coverage_info = args.isArgSet("--capture_coverage_info");

    int start_time = time(NULL);

    #pragma omp parallel
    {
        while (fasta_reader.hasNext()) {

            const int tid = omp_get_thread_num();

            Fasta_entry fe = fasta_reader.getNext();
            string sequence = fe.get_sequence();
            if (sequence == "") {
                // Entries with an empty sequence are skipped.
                continue;
            }

            string header = fe.get_header();   // not used below
            vector<unsigned int> kmer_coverage = compute_kmer_coverage(sequence, kcounter);
            unsigned int median_cov = median_coverage(kmer_coverage);
            float mean_cov = mean(kmer_coverage);
            float stdev = stDev(kmer_coverage);
            float pct_stdev_of_avg = stdev/mean_cov*100;

            // Assemble the whole line first so it can be emitted in one
            // write inside the critical section.
            stringstream stats_text;
            stats_text << median_cov << "\t" << mean_cov << "\t" << stdev << "\t"
                       << pct_stdev_of_avg << "\t" << fe.get_accession();
            stats_text << "\tthread:" << tid;

            if (write_coverage_info) {
                // add the coverage info as a comma-separated list
                stats_text << "\t";
                for (size_t pos = 0; pos < kmer_coverage.size(); pos++) {
                    if (pos > 0) {
                        stats_text << ",";
                    }
                    stats_text << kmer_coverage[pos];
                }
            }
            stats_text << endl;

            #pragma omp critical
            {
                cout << stats_text.str();
            }

            if (mean_cov < 0) {
                cerr << "ERROR, cannot have negative coverage!!" << endl;
                exit(1);
            }
        }
    }

    int end_time = time(NULL);

    cerr << "STATS_GENERATION_TIME: " << (end_time - start_time) << " seconds." << endl;

    return(0);
}
Exemplo n.º 8
0
// Counts the k-mers of every sequence in `fasta_filename` into `kcounter`.
//
// All threads of an OpenMP parallel region pull records from one shared
// Fasta_reader. Sequences shorter than kcounter.get_kmer_length() + 1 are
// counted as parsed but not added. A one-line summary is written to stderr
// once, after all threads have finished.
//
// kcounter        counter that receives add_sequence() calls.
// fasta_filename  path handed to Fasta_reader.
void populate_kmer_counter_from_reads (KmerCounter& kcounter, string& fasta_filename) {
    unsigned int kmer_length = kcounter.get_kmer_length();
    int i, myTid;
    unsigned long sum,
        *record_counter = new unsigned long[omp_get_max_threads()];
    unsigned long start, end;

    // init record counter
    for (int i = 0; i < omp_get_max_threads(); i++) {
        record_counter[i] = 0;
    }

    cerr << "-storing Kmers..." << "\n";
    start = time(NULL);

    Fasta_reader fasta_reader(fasta_filename);

    unsigned int entry_num = 0;

#pragma omp parallel private (myTid)
    {
        myTid = omp_get_thread_num();
        record_counter[myTid] = 0;

        while (fasta_reader.hasNext()) {
            Fasta_entry fe = fasta_reader.getNext();
            string accession = fe.get_accession();

#pragma omp atomic
            entry_num++;
            record_counter[myTid]++;

            if (IRKE_COMMON::MONITOR >= 4) {
                cerr << "[" << entry_num << "] acc: " << accession << ", by thread no: " << myTid << "\n";
            }
            else if (IRKE_COMMON::MONITOR) {
                // Progress line: only thread 0 reports, every 1000 of its own records.
                if (myTid == 0 && record_counter[myTid] % 1000 == 0) {
                    sum = record_counter[0];
                    for (i = 1; i < omp_get_num_threads(); i++)
                        sum += record_counter[i];
                    cerr << "\r [" << sum << "] sequences parsed.     ";
                }
            }

            string seq = fe.get_sequence();

            // Compare against the counter's own k-mer length rather than the
            // KMER_SIZE global, so the filter always matches `kcounter`.
            if (seq.length() < kmer_length + 1) {
                continue;
            }
            kcounter.add_sequence(seq);
        }
    }

    // Everything below runs once, after the parallel region has joined:
    // take the end time, total the per-thread tallies, release the tally
    // array, and print the summary.
    end = time(NULL);

    sum = record_counter[0];
    for (i = 1; i < omp_get_max_threads(); i++)
        sum += record_counter[i];
    delete [] record_counter;

    cerr << "\n" << " done parsing " << sum << " sequences, extracted " << kcounter.size() << " kmers, taking " << (end-start) << " seconds." << "\n";

    return;
}
int main(int argc, char* argv[]) {
    ArgProcessor args(argc, argv);
    if(args.isArgSet("--help") ||
       (!(args.isArgSet("--reads") && args.isArgSet("--kmers")))) {
        cerr << usage(args) << endl << endl;
        exit(1);
    }
    string reads_fasta_file = args.getStringVal("--reads");
    string kmers_fasta_file = args.getStringVal("--kmers");
    bool is_DS = (! args.isArgSet("--SS"));
    if(args.isArgSet("--kmer_size")) {
        KMER_SIZE = args.getIntVal("--kmer_size");
        if(KMER_SIZE < 20) {
            cerr << "Error, min kmer size is 20";
            exit(2);
        }
    }
    if(args.isArgSet("--monitor")) {
        IRKE_COMMON::MONITOR = args.getIntVal("--monitor");
    }
    if(omp_get_max_threads() > MAX_THREADS) {
        omp_set_num_threads(MAX_THREADS);
    }
    KmerCounter kcounter (KMER_SIZE, is_DS);
    populate_kmer_counter(kcounter, kmers_fasta_file);
    Fasta_reader fasta_reader(reads_fasta_file);
    ofstream* filewriter = NULL;
    ofstream* covwriter = NULL;
    bool write_coverage_info = args.isArgSet("--capture_coverage_info");
    while (true) {
        Fasta_entry fe = fasta_reader.getNext();
        string sequence = fe.get_sequence();
        if(sequence == "") break;
        string header = fe.get_header();
        vector<unsigned int> kmer_coverage = compute_kmer_coverage(sequence, kcounter);
        unsigned int median_cov = median_coverage(kmer_coverage);
        float mean_cov = mean(kmer_coverage);
        float stdev = stDev(kmer_coverage);
        float pct_stdev_of_avg = stdev/mean_cov*100;
        stringstream stats_text;
        stats_text << median_cov << "\t"
                   << mean_cov << "\t"
                   << stdev << "\t"
                   << pct_stdev_of_avg << "\t"
                   << fe.get_accession();
        if(write_coverage_info) {
            // add the coverage info
            stats_text << "\t";
            for (int i = 0; i < kmer_coverage.size(); i++) {
                stats_text<< kmer_coverage[i];
                if(i != kmer_coverage.size() - 1) {
                    stats_text<< ",";
                }
            }
        }
        stats_text << endl;
        cout << stats_text.str();

        if (mean_cov < 0) {
            cerr << "ERROR, cannot have negative coverage!!" << endl;
            exit(1);
        }
        
    }
    return(0);
}
Exemplo n.º 10
0
int run (int argc, char* argv[]) {
    
    if (argc < 3) {
        stringstream s;
        s << "Usage: " << argv[0] << " file.fasta kmer_length [DS_mode]" << endl << endl;
        
        cerr << s.str();
        return(1);
        
    }

    string fasta_filename (argv[1]);
    unsigned int kmer_length = atoi(argv[2]);
    
    bool DS_mode = (argc >= 3) ? true : false;
    
    Fasta_reader fasta_reader(fasta_filename);
    
    Ktree ktree;

    long read_counter = 0;
    
    while (fasta_reader.hasNext()) {
        
        read_counter++;
        if (read_counter % 1000 == 0) {
            cerr << "\rread[" << read_counter << "]   ";
        }
        

        Fasta_entry fe = fasta_reader.getNext();
        
        string accession = fe.get_accession();
        
        
        string sequence = fe.get_sequence();
        
        // cerr << "Processing: " << sequence << endl;
                        
        if (sequence.length() < kmer_length + 1) {
            continue;
        }
        
        for (unsigned int i = 0; i <= sequence.length() - kmer_length; i++) {
            
            string kmer = sequence.substr(i, kmer_length);
            
            if (! contains_non_gatc(kmer)) {

                ktree.add_kmer(kmer);
            
                if (DS_mode) {
                    kmer = revcomp(kmer);
                    ktree.add_kmer(kmer);
                }

            }
            
        }
        
    }
 

    ktree.report_kmer_counts();
    
   
    return(0);
}