/**
 * Flags short tangle contigs as dead and refreshes the graph.
 *
 * The graph is assembled into contigs; every contig for which IsTangle()
 * holds and whose size is below kmerLength + minLength - 1 has the dead flag
 * set on each of its k-mer nodes that can be found via GetNode(). Refresh()
 * is called once all contigs have been examined.
 *
 * @param minLength  length threshold used in the size cutoff above.
 * @return the number of contigs that were flagged.
 */
int64 HashGraph::Trim(int minLength)
{
    vector<Contig> contigs;
    Assemble(contigs);

    int total = 0;
#pragma omp parallel for
    for (int i = 0; i < (int)contigs.size(); ++i)
    {
        if (contigs[i].IsTangle() && contigs[i].Size() < kmerLength + minLength - 1)
        {
            // Load the first kmerLength-1 bases; each base added afterwards
            // completes one k-mer of the contig.
            Kmer kmer;
            for (int j = 0; j+1 < kmerLength; ++j)
                kmer.AddRight(contigs[i][j]);

            for (int j = kmerLength-1; j < contigs[i].Size(); ++j)
            {
                kmer.AddRight(contigs[i][j]);
                KmerNode *node = GetNode(kmer);
                if (node != NULL)
                    node->SetDeadFlag();
            }

            // The counter is shared between the loop's threads.
#pragma omp atomic
            ++total;
        }
    }

    Refresh();

    // "%lld" consumes a long long; the counter is an int, so it is converted
    // explicitly instead of being passed with a mismatched width.
    LogMessage("trim %lld dead ends\n", (long long)total);
    return total;
}
/**
 * Flags every contig whose coverage is below a threshold as dead.
 *
 * Contigs are obtained from Assemble(). For each contig with
 * Coverage() < c, the dead flag is set on every k-mer node of the contig
 * that GetNode() can find. Refresh() is called afterwards.
 *
 * @param c  coverage threshold; contigs strictly below it are flagged.
 * @return the number of contigs that were flagged.
 */
int64 HashGraph::RemoveLowCoverageContigs(double c)
{
    vector<Contig> assembled;
    Assemble(assembled);

    int flagged = 0;

#pragma omp parallel for
    for (int idx = 0; idx < (int)assembled.size(); ++idx)
    {
        Contig &contig = assembled[idx];

        // Contigs that are not strictly below the threshold are left alone.
        if (!(contig.Coverage() < c))
            continue;

        // Prime the window with the first kmerLength-1 bases.
        Kmer window;
        for (int pos = 0; pos+1 < kmerLength; ++pos)
            window.AddRight(contig[pos]);

        // Every further base completes one k-mer; flag its node if present.
        for (int pos = kmerLength-1; pos < contig.Size(); ++pos)
        {
            window.AddRight(contig[pos]);
            if (KmerNode *hit = GetNode(window))
                hit->SetDeadFlag();
        }

        // The counter is shared between the loop's threads.
#pragma omp atomic
        ++flagged;
    }

    Refresh();
    return flagged;
}
int KmerTree::aggregateThetas(){ try { vector<vector<int> > levelMatrix(numLevels+1); for(int i=0;i<tree.size();i++){ if (m->getControl_pressed()) { return 0; } levelMatrix[tree[i]->getLevel()].push_back(i); } for(int i=numLevels-1;i>0;i--) { if (m->getControl_pressed()) { return 0; } for(int j=0;j<levelMatrix[i].size();j++){ KmerNode* holder = tree[levelMatrix[i][j]]; tree[holder->getParent()]->addThetas(holder->getTheta(), holder->getNumSeqs()); } } return 0; } catch(exception& e) { m->errorOut(e, "KmerTree", "aggregateThetas"); exit(1); } }
bool HashGraph::Check() { for (int64 i = 0; i < (int64)table_size; ++i) { HashNode *node = table[i]; while (node != NULL) { KmerNodeAdapter adapter(node); Kmer kmer = adapter.GetNode()->GetKmer(); for (int strand = 0; strand < 2; ++strand) { unsigned edges = adapter.OutEdges(); for (int x = 0; x < 4; ++x) { if (edges & (1 << x)) { Kmer next = kmer; next.AddRight(x); KmerNode *q = GetNode(next); if (q == NULL) { cout << "null fail" << endl; return false; } if (q->IsDead()) { cout << "deadend fail" << endl; return false; } KmerNodeAdapter adp(q, next); if (((1 << (3 - kmer.GetBase(0))) & adp.InEdges()) == 0) { cout << (int)kmer.GetBase(0) << " " << (int)adp.InEdges() << endl; cout << "no in edge fail" << endl; return false; } } } kmer.ReverseComplement(); adapter.ReverseComplement(); } node = node->next; } } return true; }
int KmerTree::addTaxonomyToTree(string seqName, string taxonomy, vector<int>& sequence){ try { KmerNode* newNode; string taxonName = ""; int treePosition = 0; // the root is element 0 int level = 1; for(int i=0;i<taxonomy.length();i++){ // step through taxonomy string... if (m->getControl_pressed()) { break; } if(taxonomy[i] == ';'){ // looking for semicolons... if (taxonName == "") { m->mothurOut(seqName + " has an error in the taxonomy. This may be due to a ;;"); m->mothurOutEndLine(); m->setControl_pressed(true); } int newIndex = tree[treePosition]->getChildIndex(taxonName);// look to see if your current node already // has a child with the new taxonName if(newIndex != -1) { treePosition = newIndex; } // if you've seen it before, jump to that else { // position in the tree int newChildIndex = (int)tree.size(); // otherwise, we'll have to create one... tree[treePosition]->makeChild(taxonName, newChildIndex); newNode = new KmerNode(taxonName, level, kmerSize); newNode->setParent(treePosition); tree.push_back(newNode); treePosition = newChildIndex; } // sequence data to that node to update that node's theta - seems slow... taxonName = ""; // clear out the taxon name that we will build as we look level++; } // for a semicolon else{ taxonName += taxonomy[i]; // keep adding letters until we reach a semicolon } } tree[treePosition]->loadSequence(sequence); // now that we've gotten to the correct node, add the return 0; } catch(exception& e) { m->errorOut(e, "KmerTree", "addTaxonomyToTree"); exit(1); } }
void HashGraph::AddInternalKmers(const Sequence &seq, int minCount) { if (seq.Size() <= kmerLength) return; vector<int> v; int count = 0; int sum = 0; Kmer kmer; for (int i = 0; i < kmerLength-1; ++i) kmer.AddRight(seq[i]); for (int i = kmerLength-1; i < seq.Size(); ++i) { kmer.AddRight(seq[i]); KmerNode *node = GetNode(kmer); if (node != NULL && node->Count() >= (unsigned)minCount) { sum += node->Count(); ++count; v.push_back(i); } } if (count > max(seq.Size() - kmerLength*2 + 1, (seq.Size() - kmerLength + 1)/2)) { Kmer kmer; for (int i = 0; i < kmerLength-1; ++i) kmer.AddRight(seq[i]); for (int i = kmerLength-1; i < seq.Size(); ++i) { kmer.AddRight(seq[i]); if (v.front() <= i && i <= v.back() && GetNode(kmer) == NULL) { KmerNodeAdapter adp(InsertKmer(kmer), kmer); if (i >= (int)kmerLength) { adp.AddInEdge(3 - seq[i-kmerLength]); } if (i+1 < seq.Size()) { adp.AddOutEdge(seq[i+1]); } } } } }
/**
 * Reads the next k-mer record from the input file into the iterator state.
 *
 * After the record is read, the file is closed if all m_kmer_count records
 * have been consumed; otherwise, after every 1000th record, one 64-bit
 * sanity word is read from the file and compared with m_sanity.
 *
 * @return false if the file is no longer open (nothing was read),
 *         true after a record has been loaded into m_kmer / m_taxids.
 */
bool getNext()
{
    if (! m_fp) {
        return false;
    }

    m_node.read(m_fp);
    ++m_num_read;

    if (m_num_read == m_kmer_count) {
        fclose(m_fp);
        m_fp = 0;
    } else if (m_num_read % 1000 == 0) {
        // The read must not live inside assert(): with NDEBUG defined the
        // whole expression is dropped, the sanity word would stay in the
        // stream and every following record would be misaligned.
        const size_t words_read = fread(&m_test, sizeof(uint64_t), 1, m_fp);
        assert(words_read == 1);
        (void)words_read;   // silences the unused warning when NDEBUG is set
        assert(m_test == m_sanity);
    }

    m_kmer = m_node.getKmer();
    m_taxids = &m_node.getTaxIDs();
    return true;
}
void doit(string &taxtree_fn, string &kmer_db_fn, string &outfile, string &ranks_fn, size_t quit_early) { cout << "info: starting tax tree load from filename: " << taxtree_fn << endl; TaxTree<tid_T> tax_tree(taxtree_fn.c_str()); if (ranks_fn.size()) { tax_tree.setRanks(ranks_fn.c_str()); } cout << "info: tax tree size: " << tax_tree.size() << endl; cout << "info: start kmer DB load\n"; FILE *fp = Utils::openReadFile(kmer_db_fn.c_str()); KmerFileMetaData metadata; metadata.read(fp); uint64_t kmer_count = metadata.size(); uint64_t test, sanity = ~0; std::set<tid_T> tax_ids; cerr << "opening for writing: " << outfile.c_str() << endl; FILE *out_bin = fopen(outfile.c_str(), "wb"); assert(out_bin); //write metadata metadata.setVersion(TAX_HISTO_VERSION); metadata.write(out_bin); set<tid_T> tids_that_were_already_processed; KmerNode<tid_T> w; StopWatch c2; c2.start(); uint64_t j; uint64_t kmer; size_t count = 0; set<tid_T> bad_tid; string mer; uint64_t tid_ct = 0; cout << "starting; kmer count: " << kmer_count << endl; map<tid_T, set<tid_T> > species_test; map<int, int> all_species_test; uint64_t total_tid = 0; size_t singletons = 0; int ignore_kmer_cnt = 0; for (j=0; j<kmer_count; j++) { if (quit_early && j == quit_early) { cout << "quit_early: " << j << endl; break; } w.read(fp); const set<tid_T> tax_ids = w.getTaxIDs(); //allen99 quick hack to remove human k-mers bool doWrite=true; unordered_map<tid_T, set<tid_T> > tid_set; /* if( hasHuman( tax_ids, tax_tree ) ) { doWrite=false; ++ignore_kmer_cnt; } else { */ tax_tree.getLcaMap(tax_ids, tid_set); if (tid_set.size() == 0) { cout << "\nfrom tax_histo_new_fmt: WARNING: tid_set is empty; no entry will be written for kmer " << w.getKmer() << endl; cout << " this is for kmer #" << j+1 << " of " << kmer_count << endl; cout << " entries in tax_id set: "; for (typename set<tid_T>::iterator t = tax_ids.begin(); t != tax_ids.end(); t++) { cout << *t << " "; } cout << endl << endl; doWrite = false; } else { ++count; kmer= 
w.getKmer(); if(doWrite)assert(fwrite(&kmer,8, 1, out_bin) == 1); //write kmer uint16_t sz = tid_set.size(); if(doWrite) assert(fwrite(&sz,2, 1, out_bin) == 1); //write tid count tid_ct += tid_set.size(); if (sz == 1) singletons++; //write the tax IDs for (typename unordered_map<tid_T, set<tid_T> >::const_iterator t = tid_set.begin(); t != tid_set.end(); t++) { tid_T tid = t->first; if(doWrite) assert(fwrite(&tid,4, 1, out_bin) == 1); total_tid++; } if ((count) % TAX_HISTO_SANITY_COUNT ==0) { assert(fwrite(&sanity, 8, 1, out_bin) == 1); } } if ((j+1) % KMER_SANITY_COUNT == 0) { assert(fread(&test, 8, 1, fp) == 1); assert(test == sanity); } // } // else for has human } cout << "total taxids: " << total_tid << "\nrem kmer cnt: "<<ignore_kmer_cnt<<endl; cout << "singletons: " << singletons << endl; fseek(out_bin, 0, SEEK_SET); metadata.setSize(count); metadata.write(out_bin); fclose(fp); fclose(out_bin); double tm = c2.stop(); cout << endl << "total annotate time: " << tm << endl; cout << "num mapping kmers processed: " << count << endl; cout << "kmers per second (not counting startup time): " << (double)kmer_count/tm << endl; }