Beispiel #1
0
// Walks every k-mer of `sequence` (k taken from kcounter.get_kmer_length())
// and reports whether the whole sequence is supported by the k-mer catalog.
//
// For each k-mer, in order, one entry is appended to `coverage_counter`:
// the catalogued count, or 0 when the k-mer contains a non-GATC character
// or is absent from kcounter.
//
// Returns false when:
//   - the sequence is shorter than one k-mer (nothing is appended), or
//   - any k-mer is non-GATC / uncatalogued, or
//   - any k-mer has count < min_coverage or compute_entropy() < min_entropy, or
//   - exceeds_min_connectivity() rejects a (previous k-mer, k-mer) pair while
//     the path was still considered intact.
// The scan never stops early, so `coverage_counter` always receives an entry
// for every k-mer position even after the path has been ruled out.
bool IRKE::sequence_path_exists(string &sequence, unsigned int min_coverage, float min_entropy, float min_connectivity,
                                vector<unsigned int> &coverage_counter)
{
    const unsigned int k = kcounter.get_kmer_length();

    // Too short to hold even a single k-mer.
    if (sequence.length() < k) {
        return false;
    }

    bool intact = true;
    string previous;  // k-mer at the preceding position; unused at position 0

    // sequence.length() >= k here, so the unsigned subtraction cannot wrap.
    for (unsigned int start = 0; start <= sequence.length() - k; ++start) {

        string current = sequence.substr(start, k);

        // Only query the catalog for k-mers made purely of G/A/T/C.
        const bool catalogued = !contains_non_gatc(current) && kcounter.kmer_exists(current);

        if (catalogued) {
            const unsigned int count = kcounter.get_kmer_count(current);
            coverage_counter.push_back(count);

            const float entropy = compute_entropy(current);

            if (count < min_coverage || entropy < min_entropy) {
                intact = false;
            }
        }
        else {
            intact = false;
            coverage_counter.push_back(0);
        }

        // The connectivity test needs a predecessor, so it starts at the
        // second k-mer, and it is skipped once the path has already failed.
        if (start > 0 && intact
            && !exceeds_min_connectivity(kcounter, previous, current, min_connectivity)) {
            intact = false;
        }

        previous = current;
    }

    return intact;
}
// Returns the coverage of every KMER_SIZE-long window of `sequence`, in
// left-to-right order (one entry per window start position).
//
// A window for which contains_non_gatc() is true is not looked up in
// `kcounter`; it, and any window whose looked-up count is 0, is reported
// with a coverage of 1 (see the note inside the loop).
//
// A sequence shorter than KMER_SIZE yields an empty vector. The guard is
// explicit so the result does not depend on whether KMER_SIZE is a signed or
// unsigned type: with an unsigned KMER_SIZE, `length() - KMER_SIZE` would
// wrap around instead of going negative.
vector<unsigned int> compute_kmer_coverage(string& sequence, KmerCounter& kcounter) {
    vector<unsigned int> coverage;
    if(IRKE_COMMON::MONITOR) {
        cerr << "processing sequence: " << sequence << endl;
    }

    const string::size_type kmer_size = static_cast<string::size_type>(KMER_SIZE);
    if (sequence.length() < kmer_size) {
        return(coverage);
    }

    // Number of windows is known up front; avoid repeated reallocation.
    const string::size_type num_kmers = sequence.length() - kmer_size + 1;
    coverage.reserve(num_kmers);

    for (string::size_type i = 0; i < num_kmers; i++) {
        string kmer = sequence.substr(i, kmer_size);
        if(IRKE_COMMON::MONITOR >= 2) {
            // Indent by i + 1 spaces so each k-mer lines up under its
            // position in the sequence printed above.
            cerr << string(i + 1, ' ') << kmer << endl;
        }
        unsigned int kmer_count = 0;
        if(!contains_non_gatc(kmer)) {
            kmer_count = kcounter.get_kmer_count(kmer);
        }

        // Note, in the jellyfish run, we restrain it to min kmer coverage of 2.
        // If we don't find a kmer catalogued, it must have a kmer count of 1.
        if (kmer_count < 1) {
            kmer_count = 1;
        }

        coverage.push_back(kmer_count);
    }
    return(coverage);
}
Beispiel #3
0
// Returns the coverage of every KMER_SIZE-long window of `sequence`, in
// left-to-right order (one entry per window start position).
//
// A window for which contains_non_gatc() is true is not looked up in
// `kcounter`; it, and any window whose looked-up count is 0, is reported
// with a coverage of 1 (see the note inside the loop).
//
// A sequence shorter than KMER_SIZE is reported on stderr and yields an
// empty vector.
vector<unsigned int> compute_kmer_coverage(string& sequence, KmerCounter& kcounter) {
    if(IRKE_COMMON::MONITOR) {
        cerr << "processing sequence: " << sequence << endl;
    }

    const size_t kmer_size = static_cast<size_t>(KMER_SIZE);
    if (sequence.length() < kmer_size)
    {
        // Can't rely on length() - KMER_SIZE for this as length is unsigned
        cerr << "Sequence: " << sequence << " is smaller than " << KMER_SIZE << " base pairs, skipping" << endl;
        return vector<unsigned int>();
    }

    // length() >= kmer_size here, so the subtraction cannot wrap.
    const size_t num_kmers = sequence.length() - kmer_size + 1;

    vector<unsigned int> coverage;
    coverage.reserve(num_kmers);  // final size is known up front

    for (size_t i = 0; i < num_kmers; i++) {
        string kmer = sequence.substr(i, kmer_size);
        if(IRKE_COMMON::MONITOR >= 2) {
            // Indent by i + 1 spaces so each k-mer lines up under its
            // position in the sequence printed above.
            cerr << string(i + 1, ' ') << kmer << endl;
        }
        unsigned int kmer_count = 0;
        if(!contains_non_gatc(kmer)) {
            kmer_count = kcounter.get_kmer_count(kmer);
        }

        // Note, in the jellyfish run, we restrain it to min kmer coverage of 2.
        // If we don't find a kmer catalogued, it must have a kmer count of 1.
        if (kmer_count < 1) {
            kmer_count = 1;
        }

        coverage.push_back(kmer_count);
    }
    return(coverage);
}
Beispiel #4
0
int run (int argc, char* argv[]) {
    
    if (argc < 3) {
        stringstream s;
        s << "Usage: " << argv[0] << " file.fasta kmer_length [DS_mode]" << endl << endl;
        
        cerr << s.str();
        return(1);
        
    }

    string fasta_filename (argv[1]);
    unsigned int kmer_length = atoi(argv[2]);
    
    bool DS_mode = (argc >= 3) ? true : false;
    
    Fasta_reader fasta_reader(fasta_filename);
    
    Ktree ktree;

    long read_counter = 0;
    
    while (fasta_reader.hasNext()) {
        
        read_counter++;
        if (read_counter % 1000 == 0) {
            cerr << "\rread[" << read_counter << "]   ";
        }
        

        Fasta_entry fe = fasta_reader.getNext();
        
        string accession = fe.get_accession();
        
        
        string sequence = fe.get_sequence();
        
        // cerr << "Processing: " << sequence << endl;
                        
        if (sequence.length() < kmer_length + 1) {
            continue;
        }
        
        for (unsigned int i = 0; i <= sequence.length() - kmer_length; i++) {
            
            string kmer = sequence.substr(i, kmer_length);
            
            if (! contains_non_gatc(kmer)) {

                ktree.add_kmer(kmer);
            
                if (DS_mode) {
                    kmer = revcomp(kmer);
                    ktree.add_kmer(kmer);
                }

            }
            
        }
        
    }
 

    ktree.report_kmer_counts();
    
   
    return(0);
}