Пример #1
0
// Assembles the current contigs, then for every contig that reports
// IsTangle() and is shorter than kmerLength + minLength - 1, flags each of
// its k-mers found in the graph as dead. Refresh() is called afterwards.
//
// minLength: length threshold, combined with kmerLength as shown below.
// Returns the number of contigs that were flagged.
int64 HashGraph::Trim(int minLength)
{
    vector<Contig> contigs;
    Assemble(contigs);

    // 64-bit counter: matches both the int64 return type and the %lld
    // conversion used in the log message below.
    int64 total = 0;
#pragma omp parallel for
    for (int i = 0; i < (int)contigs.size(); ++i)
    {
        if (contigs[i].IsTangle() && contigs[i].Size() < kmerLength + minLength - 1)
        {
            // Prime the sliding window with the first kmerLength-1 bases.
            Kmer kmer;
            for (int j = 0; j+1 < kmerLength; ++j)
                kmer.AddRight(contigs[i][j]);

            // Each further base completes one k-mer of the contig.
            for (int j = kmerLength-1; j < contigs[i].Size(); ++j)
            {
                kmer.AddRight(contigs[i][j]);
                KmerNode *node = GetNode(kmer);
                if (node != NULL)
                    node->SetDeadFlag();
            }

            // The counter is shared between the loop's threads.
#pragma omp atomic
            ++total;
        }
    }

    Refresh();

    // Explicit cast so the argument matches %lld whatever int64 is typedef'd to.
    LogMessage("trim %lld dead ends\n", (long long)total);

    return total;
}
Пример #2
0
// Assembles the current contigs and, for each contig whose Coverage() is
// below c, flags every one of its k-mers found in the graph as dead.
// Refresh() is called afterwards.
//
// Returns the number of contigs that were below the threshold.
int64 HashGraph::RemoveLowCoverageContigs(double c)
{
    vector<Contig> assembled;
    Assemble(assembled);

    int removed = 0;
#pragma omp parallel for
    for (int idx = 0; idx < (int)assembled.size(); ++idx)
    {
        Contig &contig = assembled[idx];

        // Contigs that are not strictly below the threshold are left alone.
        if (!(contig.Coverage() < c))
            continue;

        // Load the first kmerLength-1 bases into the sliding window.
        Kmer window;
        for (int pos = 0; pos+1 < kmerLength; ++pos)
            window.AddRight(contig[pos]);

        // Slide over the rest of the contig, one complete k-mer per step.
        for (int pos = kmerLength-1; pos < contig.Size(); ++pos)
        {
            window.AddRight(contig[pos]);
            KmerNode *hit = GetNode(window);
            if (hit != NULL)
                hit->SetDeadFlag();
        }

        // The counter is shared between the loop's threads.
#pragma omp atomic
        ++removed;
    }

    Refresh();

    return removed;
}
Пример #3
0
int KmerTree::aggregateThetas(){
	try {
        vector<vector<int> > levelMatrix(numLevels+1);
        
        for(int i=0;i<tree.size();i++){
            if (m->getControl_pressed()) { return 0; }
            levelMatrix[tree[i]->getLevel()].push_back(i);
        }
        
        for(int i=numLevels-1;i>0;i--) {
            if (m->getControl_pressed()) { return 0; }
            
            for(int j=0;j<levelMatrix[i].size();j++){
                
                KmerNode* holder = tree[levelMatrix[i][j]];
                
                tree[holder->getParent()]->addThetas(holder->getTheta(), holder->getNumSeqs());				
            }
        }
        
        return 0;
	}
	catch(exception& e) {
		m->errorOut(e, "KmerTree", "aggregateThetas");
		exit(1);
	}
}
Пример #4
0
bool HashGraph::Check()
{
    for (int64 i = 0; i < (int64)table_size; ++i)
    {
        HashNode *node = table[i];
        while (node != NULL)
        {
            KmerNodeAdapter adapter(node);
            Kmer kmer = adapter.GetNode()->GetKmer();
            for (int strand = 0; strand < 2; ++strand)
            {
                unsigned edges = adapter.OutEdges();

                for (int x = 0; x < 4; ++x)
                {
                    if (edges & (1 << x))
                    {
                        Kmer next = kmer;
                        next.AddRight(x);
                        KmerNode *q = GetNode(next);
                        if (q == NULL)
                        {
                            cout << "null fail" << endl;
                            return false;
                        }

                        if (q->IsDead())
                        {
                            cout << "deadend fail" << endl;
                            return false;
                        }

                        KmerNodeAdapter adp(q, next);

                        if (((1 << (3 - kmer.GetBase(0))) & adp.InEdges()) == 0)
                        {
                            cout << (int)kmer.GetBase(0) << " " << (int)adp.InEdges() << endl;
                            cout << "no in edge fail" << endl;
                            return false;
                        }
                    }
                }

                kmer.ReverseComplement();
                adapter.ReverseComplement();
            }

            node = node->next;
        }
    }

    return true;
}
Пример #5
0
//	Walks a semicolon-delimited taxonomy string from the root (tree element 0),
//	creating any child node that does not exist yet, and then loads the
//	sequence into the node reached at the end. Returns 0.
//
//	seqName   - used only in the error message for an empty taxon name
//	taxonomy  - taxon names, each terminated by ';'
//	sequence  - data passed to loadSequence() on the final node
int KmerTree::addTaxonomyToTree(string seqName, string taxonomy, vector<int>& sequence){
	try {
        string currentTaxon = "";
        int nodeIndex = 0;								//	start at the root
        int depth = 1;
        
        for(int pos=0;pos<taxonomy.length();pos++){
            
            if (m->getControl_pressed()) { break; }
            
            //	any character other than ';' extends the taxon name being built
            if(taxonomy[pos] != ';'){
                currentTaxon += taxonomy[pos];
                continue;
            }
            
            //	a ';' ends the current taxon name; an empty name is reported
            //	and the control flag is set
            if (currentTaxon == "") {  m->mothurOut(seqName + " has an error in the taxonomy.  This may be due to a ;;"); m->mothurOutEndLine(); m->setControl_pressed(true); }
            
            int existing = tree[nodeIndex]->getChildIndex(currentTaxon);
            
            if(existing == -1) {
                //	no such child yet: register it with the parent, then append
                //	the new node at the end of the tree
                int created = (int)tree.size();
                tree[nodeIndex]->makeChild(currentTaxon, created);
                
                KmerNode* child = new KmerNode(currentTaxon, depth, kmerSize);
                child->setParent(nodeIndex);
                
                tree.push_back(child);
                nodeIndex = created;
            }
            else {
                //	child already present: descend into it
                nodeIndex = existing;
            }
            
            currentTaxon = "";							//	start collecting the next taxon name
            depth++;
        }
        
        //	attach the sequence data to the node we ended on
        tree[nodeIndex]->loadSequence(sequence);
        
        return 0;
    }
	catch(exception& e) {
		m->errorOut(e, "KmerTree", "addTaxonomyToTree");
		exit(1);
	}
	
}
Пример #6
0
void HashGraph::AddInternalKmers(const Sequence &seq, int minCount)
{
    if (seq.Size() <= kmerLength)
        return;

    vector<int> v;
    int count = 0;
    int sum = 0;
    Kmer kmer;
    for (int i = 0; i < kmerLength-1; ++i)
        kmer.AddRight(seq[i]);
    for (int i = kmerLength-1; i < seq.Size(); ++i)
    {
        kmer.AddRight(seq[i]);

        KmerNode *node = GetNode(kmer);
        if (node != NULL && node->Count() >= (unsigned)minCount)
        {
            sum += node->Count();
            ++count;
            v.push_back(i);
        }
    }

    if (count > max(seq.Size() - kmerLength*2 + 1, (seq.Size() - kmerLength + 1)/2))
    {
        Kmer kmer;
        for (int i = 0; i < kmerLength-1; ++i)
            kmer.AddRight(seq[i]);
        for (int i = kmerLength-1; i < seq.Size(); ++i)
        {
            kmer.AddRight(seq[i]);

            if (v.front() <= i && i <= v.back() && GetNode(kmer) == NULL)
            {
                KmerNodeAdapter adp(InsertKmer(kmer), kmer);
                if (i >= (int)kmerLength)
                {
                    adp.AddInEdge(3 - seq[i-kmerLength]);
                }

                if (i+1 < seq.Size())
                {
                    adp.AddOutEdge(seq[i+1]);
                }
            }
        }
    }
}
Пример #7
0
 /**
  * Reads the next k-mer record from the open file into m_node and publishes
  * its k-mer and tax-id set through m_kmer / m_taxids.
  *
  * The file is closed once m_kmer_count records have been read. After every
  * 1000th record (other than the last) a 64-bit marker is read from the file
  * and checked against m_sanity.
  *
  * @return false if the file is no longer open, true after a record was read.
  */
 bool getNext() {
   if (! m_fp) {
     return false;
   }
   m_node.read(m_fp);
   ++m_num_read;
   if (m_num_read == m_kmer_count) {
     fclose(m_fp);
     m_fp = 0;
   } else if (m_num_read % 1000 == 0) {
     // The read must not live inside assert(): with NDEBUG defined the whole
     // expression is discarded, the marker would stay in the stream and every
     // following record would be parsed from the wrong offset.
     const size_t items_read = fread(&m_test, sizeof(uint64_t), 1, m_fp);
     assert(items_read == 1);
     (void)items_read;  // unused when asserts are compiled out
     assert(m_test == m_sanity);
   }
   m_kmer = m_node.getKmer();
   m_taxids = &m_node.getTaxIDs();
   return true;
 }
Пример #8
0
/**
 * Reads a k-mer database and writes a "tax histo" binary file.
 *
 * For each input k-mer the tax-id set is reduced with TaxTree::getLcaMap();
 * k-mers whose reduced set is empty are reported on stdout and skipped. Each
 * written record is: the k-mer (8 bytes), the number of tax ids (2 bytes),
 * then the tax ids (4 bytes each). A sanity marker is written after every
 * TAX_HISTO_SANITY_COUNT records, and one is consumed from the input after
 * every KMER_SANITY_COUNT k-mers. The metadata header is written first and
 * rewritten at the end with the final record count.
 *
 * @param taxtree_fn  taxonomy tree file
 * @param kmer_db_fn  input k-mer database
 * @param outfile     output file, opened with mode "wb"
 * @param ranks_fn    optional ranks file; ignored when empty
 * @param quit_early  if non-zero, stop after this many input k-mers
 *
 * NOTE(review): the record layout hard-codes a 4-byte tid_T and a 16-bit
 * tax-id count; a set larger than 65535 entries would be truncated in the
 * count field - confirm this cannot occur.
 */
void doit(string &taxtree_fn, string &kmer_db_fn, string &outfile, string &ranks_fn, size_t quit_early) {
    cout << "info: starting tax tree load from filename: " << taxtree_fn << endl;
    TaxTree<tid_T> tax_tree(taxtree_fn.c_str());
    if (ranks_fn.size()) {
        tax_tree.setRanks(ranks_fn.c_str());
    }
    cout << "info: tax tree size: " << tax_tree.size() << endl;
    
    cout << "info: start kmer DB load\n";
    FILE *fp = Utils::openReadFile(kmer_db_fn.c_str());
    KmerFileMetaData metadata;
    metadata.read(fp);
    
    uint64_t kmer_count = metadata.size();
    
    uint64_t test = 0, sanity = ~0;
    
    cerr << "opening for writing: " << outfile.c_str() << endl;
    FILE *out_bin = fopen(outfile.c_str(), "wb");
    assert(out_bin);
    if (!out_bin) {
        // Release builds compile the assert away; do not write through NULL.
        cerr << "error: failed to open for writing: " << outfile.c_str() << endl;
        fclose(fp);
        return;
    }
    
    //write metadata (rewritten at the end once the record count is known)
    metadata.setVersion(TAX_HISTO_VERSION);
    metadata.write(out_bin);
    
    KmerNode<tid_T> w;
    StopWatch c2;
    c2.start();
    uint64_t kmer;
    size_t count = 0;           // records written to the output
    
    cout << "starting; kmer count: " << kmer_count << endl;
    
    uint64_t total_tid = 0;
    
    size_t singletons = 0;
    
    int ignore_kmer_cnt = 0;
    
    // Result of the most recent fread/fwrite. All I/O is performed outside
    // assert() so that it still happens when NDEBUG is defined.
    size_t io_ok = 0;
    
    for (uint64_t j=0; j<kmer_count; j++) {
        
        if (quit_early && j == quit_early) {
            cout << "quit_early: " << j << endl;
            break;
        }
        
        w.read(fp);
        
        // Bound by const reference: avoids copying the set when getTaxIDs()
        // returns a reference, and is equally valid when it returns by value.
        const set<tid_T> &tax_ids = w.getTaxIDs();
        
        //allen99 quick hack to remove human k-mers
        bool doWrite=true;
        
        unordered_map<tid_T, set<tid_T> > tid_set;
        
        /*
         
         if( hasHuman( tax_ids, tax_tree ) ) {
         doWrite=false;
         ++ignore_kmer_cnt;
         } else {
         */
        
        tax_tree.getLcaMap(tax_ids, tid_set);
        
        if (tid_set.size() == 0) {
            cout << "\nfrom tax_histo_new_fmt: WARNING: tid_set is empty; no entry will be written for kmer " << w.getKmer() << endl;
            cout << "    this is for kmer #" << j+1 << " of " << kmer_count << endl;
            cout << "    entries in tax_id set: ";
            for (typename set<tid_T>::const_iterator t = tax_ids.begin(); t != tax_ids.end(); t++) {
                cout << *t << " ";
            }
            cout <<  endl << endl;
            doWrite = false;
            
        } else  {
            
            ++count;
            
            kmer= w.getKmer();
            
            if (doWrite) {                      //write kmer
                io_ok = fwrite(&kmer,8, 1, out_bin);
                assert(io_ok == 1);
            }
            uint16_t sz = tid_set.size();
            if (doWrite) {                      //write tid count
                io_ok = fwrite(&sz,2, 1, out_bin);
                assert(io_ok == 1);
            }
            
            if (sz == 1)
                singletons++;
            
            //write the tax IDs
            
            for (typename unordered_map<tid_T, set<tid_T> >::const_iterator t = tid_set.begin(); t != tid_set.end(); t++) {
                tid_T tid = t->first;
                
                if (doWrite) {
                    io_ok = fwrite(&tid,4, 1, out_bin);
                    assert(io_ok == 1);
                }
                
                total_tid++;
            }
            
            // periodic marker so a reader can detect a corrupt/misaligned file
            if ((count) % TAX_HISTO_SANITY_COUNT ==0) {
                io_ok = fwrite(&sanity, 8, 1, out_bin);
                assert(io_ok == 1);
            }
            
        }
        
        // The input carries its own marker every KMER_SANITY_COUNT k-mers; it
        // has to be consumed even in release builds to stay aligned.
        if ((j+1) % KMER_SANITY_COUNT == 0) {
            io_ok = fread(&test, 8, 1, fp);
            assert(io_ok == 1);
            assert(test == sanity);
        }
        
        //    }  // else for has human
        
    }
    (void)io_ok;  // only inspected by assert()
    
    cout << "total taxids: " << total_tid << "\nrem kmer cnt: "<<ignore_kmer_cnt<<endl;
    cout << "singletons: " << singletons << endl;
    
    // Go back and overwrite the header with the number of records written.
    fseek(out_bin, 0, SEEK_SET);
    
    metadata.setSize(count);
    metadata.write(out_bin);
    
    fclose(fp);
    fclose(out_bin);
    
    double tm = c2.stop();
    cout << endl << "total annotate time: " << tm << endl;
    cout << "num mapping kmers processed: " << count << endl;
    cout << "kmers per second (not counting startup time): " << (double)kmer_count/tm << endl;
}