// stores kmers of size kmer_size with stride over paths in graphs in the index void VGset::index_kmers(Index& index, int kmer_size, int edge_max, int stride, bool allow_negatives) { // create a vector of output files // as many as there are threads for_each([&index, kmer_size, edge_max, stride, allow_negatives, this](VG* g) { int thread_count; #pragma omp parallel { #pragma omp master thread_count = omp_get_num_threads(); } // these are indexed by thread vector<vector<KmerMatch> > buffer; for (int i = 0; i < thread_count; ++i) { buffer.emplace_back(); } // how many kmer entries to hold onto uint64_t buffer_max_size = 100000; // 100k // this may need a guard auto write_buffer = [&index](int tid, vector<KmerMatch>& buf) { rocksdb::WriteBatch batch; function<void(KmerMatch&)> keep_kmer = [&index, &batch](KmerMatch& k) { index.batch_kmer(k.sequence(), k.node_id(), k.position(), batch); }; std::for_each(buf.begin(), buf.end(), keep_kmer); rocksdb::Status s = index.db->Write(rocksdb::WriteOptions(), &batch); }; auto cache_kmer = [&buffer, &buffer_max_size, &write_buffer, this](string& kmer, Node* n, int p, list<Node*>& path, VG& graph) { if (allATGC(kmer)) { int tid = omp_get_thread_num(); // note that we don't need to guard this // each thread has its own buffer! auto& buf = buffer[tid]; KmerMatch k; k.set_sequence(kmer); k.set_node_id(n->id()); k.set_position(p); buf.push_back(k); if (buf.size() > buffer_max_size) { write_buffer(tid, buf); buf.clear(); } } }; g->create_progress("indexing kmers of " + g->name, buffer.size()); g->for_each_kmer_parallel(kmer_size, edge_max, cache_kmer, stride, false, allow_negatives); g->destroy_progress(); g->create_progress("flushing kmer buffers " + g->name, g->size()); int tid = 0; #pragma omp parallel for schedule(dynamic) for (int i = 0; i < buffer.size(); ++i) { auto& buf = buffer[i]; write_buffer(i, buf); g->update_progress(tid); } buffer.clear(); g->destroy_progress(); }); index.remember_kmer_size(kmer_size); }