int64_t InsertIterativeKmers(const HashGraph &old_hash_graph, const Sequence &seq, HashGraph &hash_graph, int kmer_count) { int old_kmer_size = old_hash_graph.kmer_size(); int new_kmer_size = hash_graph.kmer_size(); Kmer old_kmer(old_kmer_size); Kmer new_kmer(new_kmer_size); int length = 0; int count = 0; int num_iterative_kmers = 0; for (uint32_t j = 0; j < seq.size(); ++j) { old_kmer.ShiftAppend(seq[j]); new_kmer.ShiftAppend(seq[j]); length = (seq[j] < 4) ? length + 1 : 0; count = (length >= old_kmer_size && old_hash_graph.FindVertex(old_kmer) != NULL) ? count+1 : 0; if (count >= new_kmer_size - old_kmer_size + 1) { ++num_iterative_kmers; HashGraphVertex *vertex = hash_graph.InsertVertex(new_kmer, kmer_count); HashGraphVertexAdaptor adaptor(vertex, new_kmer != vertex->kmer()); if (length > new_kmer_size && seq[j-new_kmer_size] < 4) adaptor.in_edges().Add(3 - seq[j-new_kmer_size]); if (j+1 < seq.size() && seq[j+1] < 4) adaptor.out_edges().Add(seq[j+1]); } } return num_iterative_kmers; }
// Feeds every flagged short read and every flagged long read of `assembly_info` to
// HashGraph::InsertExistKmers on `hash_graph`. Reads whose flag is false are skipped.
// Both passes run as OpenMP parallel loops; the flags are only read here.
void InsertExistKmers(AssemblyInfo &assembly_info, HashGraph &hash_graph)
{
    deque<ShortSequence> &short_reads = assembly_info.reads;
    deque<Sequence> &long_seqs = assembly_info.long_reads;
    vector<bool> &short_enabled = assembly_info.read_flags;
    vector<bool> &long_enabled = assembly_info.long_read_flags;

    // Disabled in the original: a pass over assembly_info.ref_contigs followed by
    // hash_graph.ClearCount().
//#pragma omp parallel for
//    for (int64_t i = 0; i < (int64_t)assembly_info.ref_contigs.size(); ++i)
//        hash_graph.InsertExistKmers(assembly_info.ref_contigs[i]);
    //hash_graph.ClearCount();

#pragma omp parallel for
    for (int64_t idx = 0; idx < (int64_t)short_reads.size(); ++idx)
    {
        if (short_enabled[idx])
        {
            // Short reads are converted to a Sequence before insertion.
            Sequence expanded(short_reads[idx]);
            // Disabled in the original: storing (result != 0) back into read_flags[idx].
            hash_graph.InsertExistKmers(expanded);
        }
    }

#pragma omp parallel for
    for (int64_t idx = 0; idx < (int64_t)long_seqs.size(); ++idx)
    {
        if (long_enabled[idx])
        {
            // Disabled in the original: storing (result != 0) back into long_read_flags[idx].
            hash_graph.InsertExistKmers(long_seqs[idx]);
        }
    }
}
// Loads a binary k-mer file into `hash_graph`.
//
// The file size divided by sizeof(HashGraphVertex) is used as the capacity passed to
// hash_graph.reserve() before the stream is read with operator>>.
//
// kmer_file  : path of the file to read.
// hash_graph : graph that is reserved and then filled from the stream.
//
// If the file cannot be opened or its size cannot be determined, a message is written
// to stderr and `hash_graph` is left untouched. Previously tellg() returning -1 was
// divided by the unsigned sizeof, producing an enormous reserve() request.
void ReadKmerFile(const std::string &kmer_file, HashGraph &hash_graph)
{
    ifstream fkmer(kmer_file.c_str(), ios_base::binary | ios_base::in);
    if (!fkmer)
    {
        std::cerr << "ReadKmerFile: cannot open " << kmer_file << std::endl;
        return;
    }

    fkmer.seekg(0, ios_base::end);
    const int64_t file_bytes = fkmer.tellg();   // -1 when the position is unavailable
    if (file_bytes < 0)
    {
        std::cerr << "ReadKmerFile: cannot determine size of " << kmer_file << std::endl;
        return;
    }

    // Signed division: keeps the arithmetic away from size_t wrap-around.
    const int64_t num_nodes = file_bytes / (int64_t)sizeof(HashGraphVertex);
    hash_graph.reserve(num_nodes);

    fkmer.seekg(0, ios_base::beg);
    fkmer >> hash_graph;
}
// Calls HashGraph::InsertInternalKmers for every short read and every long read in
// `assembly_info`, then asks the graph to restore/merge and refresh its edges.
//
// min_count is forwarded unchanged to HashGraph::InsertInternalKmers.
void InsertInternalKmers(AssemblyInfo &assembly_info, HashGraph &hash_graph, int min_count)
{
    deque<ShortSequence> &short_reads = assembly_info.reads;
    deque<Sequence> &long_seqs = assembly_info.long_reads;

#pragma omp parallel for schedule(static, 1)
    for (int64_t idx = 0; idx < (int64_t)short_reads.size(); ++idx)
    {
        // Short reads are converted to a Sequence before being handed to the graph.
        Sequence expanded(short_reads[idx]);
        hash_graph.InsertInternalKmers(expanded, min_count);
    }

#pragma omp parallel for schedule(static, 1)
    for (int64_t idx = 0; idx < (int64_t)long_seqs.size(); ++idx)
    {
        hash_graph.InsertInternalKmers(long_seqs[idx], min_count);
    }

    // Edge bookkeeping runs once, after both parallel passes have finished.
    hash_graph.RestoreAndMergeEdges();
    hash_graph.RefreshEdges();
}
// Turns `hash_graph` into contigs for its current k-mer size and writes the results.
//
// Steps, in order:
//   1. print vertex/edge counts and take the mean of the coverage histogram;
//   2. assemble the hash graph into contigs and release the hash graph's storage;
//   3. build a ContigGraph, remove dead ends, merge bubbles and similar paths and,
//      unless option.is_no_coverage is set, remove local low-coverage parts;
//   4. write the intermediate contigs to option.graph_file(kmer_size);
//   5. unless option.is_no_coverage is set, run the local-coverage iteration;
//   6. print N50 and write the final contigs to option.contig_file(kmer_size).
//
// `hash_graph` is empty when this function returns.
void Assemble(HashGraph &hash_graph)
{
    cout << "kmers " << hash_graph.num_vertices() << " "<< hash_graph.num_edges() << endl;

    int kmer_size = hash_graph.kmer_size();

    // The first round (kmer_size == option.mink) uses min_count, later rounds use
    // min_support; the value is never below 1.
    double min_cover = max(1, (kmer_size == option.mink ? option.min_count : option.min_support));

    Histgram<int> coverage_hist = hash_graph.coverage_histgram();
    double mean_coverage = coverage_hist.mean();

    deque<Sequence> contigs;
    deque<ContigInfo> contig_infos;
    hash_graph.Assemble(contigs, contig_infos);

    // clear() followed by a swap with a temporary that is destroyed right away.
    hash_graph.clear();
    {
        HashGraph discard;
        discard.swap(hash_graph);
    }

    ContigGraph contig_graph(kmer_size, contigs, contig_infos);
    contigs.clear();
    contig_infos.clear();

    contig_graph.RemoveDeadEnd(option.min_contig);

    int merged_bubbles = contig_graph.RemoveBubble();
    cout << "merge bubble " << merged_bubbles << endl;

    contig_graph.MergeSimilarPath();

    if (!option.is_no_coverage)
    {
        contig_graph.RemoveLocalLowCoverage(min_cover, option.min_contig, 0.1);
    }

    contig_graph.SortVertices();
    contig_graph.GetContigs(contigs, contig_infos);
    WriteSequence(option.graph_file(kmer_size), contigs);
    contigs.clear();
    contig_infos.clear();

    if (!option.is_no_coverage)
    {
        double ratio;
        if (kmer_size < option.maxk)
            ratio = 0.5;
        else
            ratio = 0.2;

        // The ratio is raised to 2 / mean coverage when that value is larger.
        const double coverage_floor = 2.0 / mean_coverage;
        if (coverage_floor > ratio)
            ratio = coverage_floor;

        contig_graph.IterateLocalCoverage(option.min_contig, ratio, min_cover, 1e100, 1.1);
        contig_graph.MergeSimilarPath();
    }

    deque<Sequence> final_contigs;
    deque<ContigInfo> final_contig_infos;
    contig_graph.SortVertices();
    contig_graph.GetContigs(final_contigs, final_contig_infos);
    PrintN50(final_contigs);

    //WriteSequence(option.contig_file(kmer_size), multi_contigs);
    WriteContig(option.contig_file(kmer_size), final_contigs, final_contig_infos, FormatString("contig-%d", kmer_size));
    //WriteContigInfo(option.contig_info_file(kmer_size), multi_contig_infos);
}
void IterateHashGraph(AssemblyInfo &assembly_info, int new_kmer_size, int min_support, HashGraph &hash_graph, deque<Sequence> &old_contigs) { int old_kmer_size = hash_graph.kmer_size(); deque<ShortSequence> &reads = assembly_info.reads; deque<Sequence> &long_reads = assembly_info.long_reads; vector<bool> &read_flags = assembly_info.read_flags; vector<bool> &long_read_flags = assembly_info.long_read_flags; #pragma omp parallel for schedule(static, 1) for (int64_t i = 0; i < (int64_t)old_contigs.size(); ++i) hash_graph.InsertUncountKmers(old_contigs[i]); hash_graph.AddAllEdges(); deque<Sequence> contigs; hash_graph.Assemble(contigs); hash_graph.clear(); uint64_t sum = 0; int d = new_kmer_size - old_kmer_size; for (unsigned i = 0; i < contigs.size(); ++i) { if ((int)contigs[i].size() - old_kmer_size + 1 >= 2*d + 2) sum += 2*d + 2; else if ((int)contigs[i].size() >= old_kmer_size) sum += contigs[i].size() - old_kmer_size + 1; } HashGraph old_hash_graph(old_kmer_size); old_hash_graph.reserve(sum); #pragma omp parallel for schedule(static, 1) for (int64_t i = 0; i < (int64_t)contigs.size(); ++i) { Sequence seq; seq.Assign(contigs[i], 0, min(new_kmer_size, (int)contigs[i].size())); old_hash_graph.InsertKmers(seq); seq.Assign(contigs[i], max(0, (int)contigs[i].size() - new_kmer_size), min(new_kmer_size, (int)contigs[i].size())); old_hash_graph.InsertKmers(seq); } //cout << "old kmer " << old_hash_graph.num_vertices() << endl; hash_graph.set_kmer_size(new_kmer_size); #pragma omp parallel for for (int64_t i = 0; i < (int64_t)reads.size(); ++i) { if (!read_flags[i]) continue; Sequence seq(reads[i]); InsertIterativeKmers(old_hash_graph, seq, hash_graph); } #pragma omp parallel for schedule(static, 1) for (int64_t i = 0; i < (int64_t)long_reads.size(); ++i) { if (!long_read_flags[i]) continue; InsertIterativeKmers(old_hash_graph, long_reads[i], hash_graph); } #pragma omp parallel for schedule(static, 1) for (int64_t i = 0; i < (int64_t)assembly_info.ref_contigs.size(); ++i) 
InsertIterativeKmers(old_hash_graph, assembly_info.ref_contigs[i], hash_graph); old_hash_graph.clear(); { HashGraph tmp_hash_graph; tmp_hash_graph.swap(old_hash_graph); } hash_graph.RefreshVertices(min_support); #pragma omp parallel for schedule(static, 1) for (int64_t i = 0; i < (int64_t)old_contigs.size(); ++i) hash_graph.InsertUncountKmers(old_contigs[i]); hash_graph.ClearCount(); InsertExistKmers(assembly_info, hash_graph); }
int main(int argc, char** argv) { if(argc!=8) { cout<<"querytale [configuration file] [query file] [orthology file] [table Prefix] " "[approximate percentage] [percentage of important nodes] [TOP_K] " "\n"; exit(-1); } Config cf; cf.readFile(argv[1]); string tablePrefix=string(argv[4]); // cf.printConfig(); QueryParams p; p.p_miss=atof(argv[5]); if(p.p_miss>1 || p.p_miss<0) { cout<<"Error: Unrecognized parameter: "<<p.p_miss<<endl; exit(-1); } float p_imp=atof(argv[6]); if(p_imp>1 || p_imp<0) { cout<<"Error: Unrecognized parameter: "<<p_imp<<endl; exit(-1); } p.top_k=atoi(argv[7]); if(p.top_k<0) { cout<<"Error: Unrecognized parameter: "<<p.top_k<<endl; exit(-1); } FILE* fin=fopen(argv[2], "r"); if(!fin) { cout << "Error: Unable to open file |" << argv[2] << "|\n"; exit(-1); } string filePrefix=string(argv[2]); int i=filePrefix.find_last_of("."); if(i>0) filePrefix=filePrefix.substr(0, i); FILE* orthf=NULL; string orth_file_name=string(argv[3]); orthf=fopen(orth_file_name.c_str(), "r"); if(!orthf) { cout << "Error: Unable to open file |" << orth_file_name << "|\n"; exit(-1); } PGDB* db=new PGDB((char*)cf.dbname.c_str(), (char*)cf.dbuser.c_str(), (char*)cf.dbpwd.c_str()); if(db==NULL) { cout<<"Error: Can not create db"<<endl; exit(-1); } debug(15, "db connected!\n"); Ortholog orth; orth.buildOrthologMap(orthf); Query qry(db); qry.setTablePrefix(tablePrefix); HashGraph* Q; OrthologInfoList orthinfolist; MicroTimer timer; timer.reset(); // Q=readGraphSAGAFormat(fin, orth, orthinfolist); Q=readGraphGDFFormat(fin, "query", orth, orthinfolist); if(Q==NULL) { cout<<"Error: Can not create graph\n"; exit(-1); } p.n_imp=(int)(Q->n()*p_imp); qry.setQueryParams(p); debug(46, "successfully read input file\n"); debug(47, "successfully read input file2\n"); assert(Q!=NULL); qry.setGraph(Q, &orthinfolist); debug(46, "before match, everything is fine\n"); debug(47, "before match, everything is fine2\n"); qry.performQuery(); cout<<"-- # of Matches: "<<qry.mlist.size()<<endl; 
for(unsigned int i=0; i<qry.mlist.size(); i++) { char filename[200]; int dbgid=qry.mlist[i]->dbgid; OrthologInfoList orthinfolist_db; HashGraph* DBG=readGraphFromDB(db, tablePrefix, dbgid, orthinfolist_db); sprintf(filename, "%s_tale_%s_%d.gdf", filePrefix.c_str(), DBG->getGraphAttrs()->find("name")->second.value.c_str(), i+1); cout<<" Match "<<i+1<<": "<<filename<<endl; ofstream of(filename); if(!of.is_open()) { cout<<"Error: Can not open file "<<filename<<endl; exit(-1); } //writeMatch(cout, *(qry.mlist[i])); writeMatchGDF(of, Q, &orthinfolist, DBG, &orthinfolist_db, *(qry.mlist[i]), orth); of.close(); delete qry.mlist[i]; delete DBG; } cout<<"-- Execution Time (s): "<<fixed<<setprecision(2)<<timer.stop_seconds()<<endl; if(fin) fclose(fin); if(orthf!=NULL) fclose(orthf); delete Q; if(db!=NULL) { delete db; db=NULL; } return 0; }
namespace ics { HashGraph<int> g; ics::HashGraph<int>& prompt_graph() { static ics::HashGraph<int> g2; g2.clear(); for (;;) { std::string n = ics::prompt_string(" Enter degree 0 node name (QUIT to quit)"); if (n == "QUIT") break; g2.add_node(n); } for (;;) { std::string o = ics::prompt_string(" Enter origin node name (QUIT to quit)"); if (o == "QUIT") break; std::string d = ics::prompt_string(" Enter origin node name (QUIT to quit)"); if (d == "QUIT") break; int v = ics::prompt_int(" Enter edge value"); g2.add_edge(o,d,v); } return g2; } std::string menu_prompt (std::string preface) { std::cout << std::endl; std::cout << g << std::endl; std::cout << preface+"\nMutators Accessors General" << std::endl; std::cout << preface+" an - add_node m - empty l - load from file" << std::endl; std::cout << preface+" ae - add_edge #n - node_count s - store to file" << std::endl; std::cout << preface+" rn - remove_node #e - edge_count q - quit" << std::endl; std::cout << preface+" re - remove_edge n - has_node" << std::endl; std::cout << preface+" x - clear e - has_edge" << std::endl; std::cout << preface+" = - = v - edge_value" << std::endl; std::cout << preface+" i - in_degree" << std::endl; std::cout << preface+" o - out_degree" << std::endl; std::cout << preface+" d - degree" << std::endl; std::cout << preface+" < - <<" << std::endl; std::cout << preface+" r - relations" << std::endl; std::string allowable[] = {"an","ae","rn","re","x","=","m","#n","#e","n","e","v","i","o","d","<","r","l","s","q",""}; return ics::prompt_string("\n"+preface+"Enter graph command","",allowable); } void process_commands(std::string preface) { for (;;) try { std::string command = menu_prompt(preface); if (command == "an") { std::string n = ics::prompt_string(" Enter node name"); g.add_node(n); } else if (command == "ae") { std::string o = ics::prompt_string(" Enter origin node name"); std::string d = ics::prompt_string(" Enter destination node name"); int v = ics::prompt_int (" Enter value for this 
edge "); g.add_edge(o,d,v); } else if (command == "rn") { std::string n = ics::prompt_string(" Enter node name"); g.remove_node(n); } else if (command == "re") { std::string o = ics::prompt_string(" Enter origin node name"); std::string d = ics::prompt_string(" Enter destination node name"); g.remove_edge(o,d); } else if (command == "x") g.clear(); else if (command == "=") { ics::HashGraph<int> g2(ics::prompt_graph()); g = g2; } else if (command == "m") std::cout << " empty() = " << g.empty() << std::endl; else if (command == "#n") std::cout << " node_count() = " << g.node_count() << std::endl; else if (command == "#e") std::cout << " edge_count() = " << g.edge_count() << std::endl; else if (command == "n") { std::string n = ics::prompt_string(" Enter node name"); std::cout << " has_node(...) = " << g.has_node(n) << std::endl; } else if (command == "e") { std::string o = ics::prompt_string(" Enter origin node name"); std::string d = ics::prompt_string(" Enter destination node name"); std::cout << " has_edge(...) = " << g.has_edge(o,d) << std::endl; } else if (command == "v") { std::string o = ics::prompt_string(" Enter origin node name"); std::string d = ics::prompt_string(" Enter destination node name"); std::cout << " edge_value(...) = " << g.edge_value(o,d) << std::endl; } else if (command == "i") { std::string n = ics::prompt_string(" Enter node name"); std::cout << " in_degree(...) = " << g.in_degree(n) << std::endl; } else if (command == "o") { std::string n = ics::prompt_string(" Enter node name"); std::cout << " out_degree(...) = " << g.out_degree(n) << std::endl; } else if (command == "d") { std::string n = ics::prompt_string(" Enter node name"); std::cout << " degree(...) 
= " << g.degree(n) << std::endl; } else if (command == "<") std::cout << preface+" << = " << g << std::endl; else if (command == "r") { std::cout << " g == g = " << (g == g) << std::endl; std::cout << " g != g = " << (g != g) << std::endl; ics::HashGraph<int> g2(ics::prompt_graph()); std::cout << " g == g2 = " << (g == g2) << std::endl; std::cout << " g != g2 = " << (g != g2) << std::endl; } else if (command == "l") { std::string separator = prompt_string(" Enter separator used in file"); std::ifstream in_file; ics::safe_open(in_file," Enter file name to read graph from","graph.txt"); g.load(in_file,separator); } else if (command == "s") { std::string separator = prompt_string(" Enter separator to use in file"); std::ofstream out_file; out_file.open(ics::prompt_string(" Enter file name to store graph in").c_str()); if (out_file.fail()) std::cout << " Illegal file name" << std :: endl; else g.store(out_file,separator); } else if (command == "q") break; else std::cout << preface+"\""+command+"\" is unknown command" << std::endl; } catch (ics::IcsError& e) { std::cout << preface+" " << e.what() << std::endl; } } }
void process_commands(std::string preface) { for (;;) try { std::string command = menu_prompt(preface); if (command == "an") { std::string n = ics::prompt_string(" Enter node name"); g.add_node(n); } else if (command == "ae") { std::string o = ics::prompt_string(" Enter origin node name"); std::string d = ics::prompt_string(" Enter destination node name"); int v = ics::prompt_int (" Enter value for this edge "); g.add_edge(o,d,v); } else if (command == "rn") { std::string n = ics::prompt_string(" Enter node name"); g.remove_node(n); } else if (command == "re") { std::string o = ics::prompt_string(" Enter origin node name"); std::string d = ics::prompt_string(" Enter destination node name"); g.remove_edge(o,d); } else if (command == "x") g.clear(); else if (command == "=") { ics::HashGraph<int> g2(ics::prompt_graph()); g = g2; } else if (command == "m") std::cout << " empty() = " << g.empty() << std::endl; else if (command == "#n") std::cout << " node_count() = " << g.node_count() << std::endl; else if (command == "#e") std::cout << " edge_count() = " << g.edge_count() << std::endl; else if (command == "n") { std::string n = ics::prompt_string(" Enter node name"); std::cout << " has_node(...) = " << g.has_node(n) << std::endl; } else if (command == "e") { std::string o = ics::prompt_string(" Enter origin node name"); std::string d = ics::prompt_string(" Enter destination node name"); std::cout << " has_edge(...) = " << g.has_edge(o,d) << std::endl; } else if (command == "v") { std::string o = ics::prompt_string(" Enter origin node name"); std::string d = ics::prompt_string(" Enter destination node name"); std::cout << " edge_value(...) = " << g.edge_value(o,d) << std::endl; } else if (command == "i") { std::string n = ics::prompt_string(" Enter node name"); std::cout << " in_degree(...) = " << g.in_degree(n) << std::endl; } else if (command == "o") { std::string n = ics::prompt_string(" Enter node name"); std::cout << " out_degree(...) 
= " << g.out_degree(n) << std::endl; } else if (command == "d") { std::string n = ics::prompt_string(" Enter node name"); std::cout << " degree(...) = " << g.degree(n) << std::endl; } else if (command == "<") std::cout << preface+" << = " << g << std::endl; else if (command == "r") { std::cout << " g == g = " << (g == g) << std::endl; std::cout << " g != g = " << (g != g) << std::endl; ics::HashGraph<int> g2(ics::prompt_graph()); std::cout << " g == g2 = " << (g == g2) << std::endl; std::cout << " g != g2 = " << (g != g2) << std::endl; } else if (command == "l") { std::string separator = prompt_string(" Enter separator used in file"); std::ifstream in_file; ics::safe_open(in_file," Enter file name to read graph from","graph.txt"); g.load(in_file,separator); } else if (command == "s") { std::string separator = prompt_string(" Enter separator to use in file"); std::ofstream out_file; out_file.open(ics::prompt_string(" Enter file name to store graph in").c_str()); if (out_file.fail()) std::cout << " Illegal file name" << std :: endl; else g.store(out_file,separator); } else if (command == "q") break; else std::cout << preface+"\""+command+"\" is unknown command" << std::endl; } catch (ics::IcsError& e) { std::cout << preface+" " << e.what() << std::endl; } }
// Assembles `hash_graph` into contigs and transcripts for its current k-mer size and
// writes the results to the files named by `option`.
//
// Steps, in order:
//   1. print vertex/edge counts, assemble the hash graph into contigs and release the
//      hash graph's storage;
//   2. build a ContigGraph; unless option.is_no_coverage is set, remove stand-alone
//      contigs, merge bubbles and remove local low-coverage parts;
//   3. write the intermediate contigs to option.graph_file(kmer_size);
//   4. unless option.is_no_coverage is set, run the trim / merge / component-coverage /
//      prune sequence;
//   5. write contigs and contig infos, then find isoforms, keep transcripts of at
//      least 300 symbols and write them to option.transcript_file(kmer_size).
//
// `hash_graph` is empty when this function returns.
void Assemble(HashGraph &hash_graph)
{
    cout << "kmers " << hash_graph.num_vertices() << " "<< hash_graph.num_edges() << endl;

    int kmer_size = hash_graph.kmer_size();

    // The first round (kmer_size == option.mink) uses min_count, later rounds use
    // min_support; the value is never below 1.
    double min_cover = max(1, (kmer_size == option.mink ? option.min_count : option.min_support));

    // The histogram is fetched but not used further in this variant.
    Histgram<int> coverage_hist = hash_graph.coverage_histgram();
    //double expected_coverage = hist.mean();

    deque<Sequence> contigs;
    deque<ContigInfo> contig_infos;
    hash_graph.Assemble(contigs, contig_infos);

    // clear() followed by a swap with a temporary that is destroyed right away.
    hash_graph.clear();
    {
        HashGraph discard;
        discard.swap(hash_graph);
    }

    ContigGraph contig_graph(kmer_size, contigs, contig_infos);
    contigs.clear();
    contig_infos.clear();

    if (!option.is_no_coverage)
    {
        contig_graph.RemoveStandAlone(kmer_size);

        int merged_bubbles = contig_graph.RemoveBubble();
        cout << "merge bubble " << merged_bubbles << endl;

        contig_graph.RemoveLocalLowCoverage(min_cover, option.min_contig, 0.1);
    }

    contig_graph.SortVertices();
    contig_graph.GetContigs(contigs, contig_infos);
    WriteSequence(option.graph_file(kmer_size), contigs);
    contigs.clear();
    contig_infos.clear();

    if (!option.is_no_coverage)
    {
        double ratio = 0.25;

        // Scratch containers local to this stage; GetContigs is called between the
        // graph operations exactly as in the original sequence.
        deque<Sequence> stage_contigs;
        deque<ContigInfo> stage_contig_infos;

        contig_graph.GetContigs(stage_contigs, stage_contig_infos);
        PrintN50(stage_contigs);

        contig_graph.Trim(10);
        contig_graph.MergeSimilarPath();
        contig_graph.GetContigs(stage_contigs, stage_contig_infos);

        contig_graph.InitializeTable();
        contig_graph.IterateComponentCoverage2(option.min_contig, ratio, min_cover, 1e100, 1.1, max_component_size);
        contig_graph.GetContigs(stage_contigs, stage_contig_infos);

        contig_graph.Trim(10);
        contig_graph.Prune(kmer_size);
        contig_graph.GetContigs(stage_contigs, stage_contig_infos);

        contig_graph.MergeSimilarPath();
    }

    deque<Sequence> final_contigs;
    deque<ContigInfo> final_contig_infos;
    contig_graph.SortVertices();
    contig_graph.GetContigs(final_contigs, final_contig_infos);
    PrintN50(final_contigs);
    WriteSequence(option.contig_file(kmer_size), final_contigs);
    WriteContigInfo(option.contig_info_file(kmer_size), final_contig_infos);

    deque<Sequence> transcripts;
    FindIsoforms(contig_graph, transcripts);

    // In-place compaction: keep only transcripts of at least 300 symbols, preserving
    // their order, then shrink the container to the kept count.
    int kept = 0;
    for (unsigned t = 0; t < transcripts.size(); ++t)
    {
        if (transcripts[t].size() >= 300)
        {
            transcripts[kept++] = transcripts[t];
        }
    }
    transcripts.resize(kept);

    PrintN50(transcripts);
    WriteSequence(option.transcript_file(kmer_size), transcripts, FormatString("transcript-%d", kmer_size));
}