// Builds contigs for a single k-mer size from hash_graph and writes them out.
//
// Steps, in order:
//   1. Log the vertex/edge counts and derive the coverage thresholds.
//   2. Extract the raw contigs from hash_graph, then empty hash_graph.
//   3. Build a ContigGraph and simplify it (dead ends, bubbles, similar
//      paths and, unless option.is_no_coverage is set, local low coverage).
//   4. Write the simplified contigs to option.graph_file(k).
//   5. Unless option.is_no_coverage is set, run the iterative local-coverage
//      pass followed by another similar-path merge.
//   6. Print N50 and write the final contigs to option.contig_file(k).
//
// On return hash_graph holds the state of a default-constructed HashGraph.
void Assemble(HashGraph &hash_graph)
{
    cout << "kmers " << hash_graph.num_vertices() << " " << hash_graph.num_edges() << endl;

    int k = hash_graph.kmer_size();

    // The smallest k uses option.min_count, every other k uses
    // option.min_support; the threshold is never allowed below 1.
    double coverage_floor = max(1, (k == option.mink ? option.min_count : option.min_support));

    // The mean has to be read before the hash graph is emptied below.
    // NOTE(review): a mean of 0 makes the ratio bound further down infinite
    // - confirm that cannot happen for real input.
    Histgram<int> coverage_hist = hash_graph.coverage_histgram();
    double mean_coverage = coverage_hist.mean();

    deque<Sequence> raw_contigs;
    deque<ContigInfo> raw_contig_infos;
    hash_graph.Assemble(raw_contigs, raw_contig_infos);

    // Empty the hash graph, then swap it with a temporary that is destroyed
    // at the end of the statement (presumably to release its storage early).
    hash_graph.clear();
    HashGraph().swap(hash_graph);

    ContigGraph contig_graph(k, raw_contigs, raw_contig_infos);
    raw_contigs.clear();
    raw_contig_infos.clear();

    // First round of graph simplification.
    contig_graph.RemoveDeadEnd(option.min_contig);

    int merged_bubbles = contig_graph.RemoveBubble();
    cout << "merge bubble " << merged_bubbles << endl;

    contig_graph.MergeSimilarPath();

    if (!option.is_no_coverage)
    {
        contig_graph.RemoveLocalLowCoverage(coverage_floor, option.min_contig, 0.1);
    }

    // Dump the simplified graph's contigs for this k.
    contig_graph.SortVertices();
    contig_graph.GetContigs(raw_contigs, raw_contig_infos);
    WriteSequence(option.graph_file(k), raw_contigs);
    raw_contigs.clear();
    raw_contig_infos.clear();

    if (!option.is_no_coverage)
    {
        // Base ratio: 0.5 below the largest k, 0.2 at (or above) it.
        double ratio;
        if (k < option.maxk)
            ratio = 0.5;
        else
            ratio = 0.2;

        // The ratio is raised to 2 / mean coverage when that is larger.
        double ratio_floor = 2.0 / mean_coverage;
        if (ratio < ratio_floor)
            ratio = ratio_floor;

        // NOTE(review): the meaning of the trailing 1e100 and 1.1 arguments
        // is defined by IterateLocalCoverage - confirm against its declaration.
        contig_graph.IterateLocalCoverage(option.min_contig, ratio, coverage_floor, 1e100, 1.1);
        contig_graph.MergeSimilarPath();
    }

    // Final contigs for this k.
    deque<Sequence> final_contigs;
    deque<ContigInfo> final_contig_infos;
    contig_graph.SortVertices();
    contig_graph.GetContigs(final_contigs, final_contig_infos);
    PrintN50(final_contigs);

    // Disabled alternatives kept from the original:
    //   WriteSequence(option.contig_file(kmer_size), multi_contigs);
    //   WriteContigInfo(option.contig_info_file(kmer_size), multi_contig_infos);
    WriteContig(option.contig_file(k), final_contigs, final_contig_infos, FormatString("contig-%d", k));
}
// Builds contigs and transcripts for a single k-mer size from hash_graph and
// writes them out.
//
// Steps, in order:
//   1. Log the vertex/edge counts and derive the minimum coverage threshold.
//   2. Extract the raw contigs from hash_graph, then empty hash_graph.
//   3. Build a ContigGraph; unless option.is_no_coverage is set, remove
//      stand-alone contigs, bubbles and local low-coverage contigs.
//   4. Write the resulting contigs to option.graph_file(k).
//   5. Unless option.is_no_coverage is set, trim, merge similar paths, run the
//      component-coverage iteration, then trim, prune and merge again.
//   6. Print N50 and write contigs plus contig infos for this k.
//   7. Find isoforms, keep those of size >= 300, print their N50 and write
//      them to option.transcript_file(k).
//
// On return hash_graph holds the state of a default-constructed HashGraph.
void Assemble(HashGraph &hash_graph)
{
    cout << "kmers " << hash_graph.num_vertices() << " " << hash_graph.num_edges() << endl;

    int k = hash_graph.kmer_size();

    // The smallest k uses option.min_count, every other k uses
    // option.min_support; the threshold is never allowed below 1.
    double coverage_floor = max(1, (k == option.mink ? option.min_count : option.min_support));

    // NOTE(review): the histogram is fetched but no longer read; its only
    // consumer is the disabled line below. Confirm the call has no side
    // effects before removing it.
    Histgram<int> coverage_hist = hash_graph.coverage_histgram();
    //double expected_coverage = hist.mean();

    deque<Sequence> raw_contigs;
    deque<ContigInfo> raw_contig_infos;
    hash_graph.Assemble(raw_contigs, raw_contig_infos);

    // Empty the hash graph, then swap it with a temporary that is destroyed
    // at the end of the statement (presumably to release its storage early).
    hash_graph.clear();
    HashGraph().swap(hash_graph);

    ContigGraph contig_graph(k, raw_contigs, raw_contig_infos);
    raw_contigs.clear();
    raw_contig_infos.clear();

    // First round of graph simplification, skipped entirely without coverage.
    if (!option.is_no_coverage)
    {
        contig_graph.RemoveStandAlone(k);

        int merged_bubbles = contig_graph.RemoveBubble();
        cout << "merge bubble " << merged_bubbles << endl;

        contig_graph.RemoveLocalLowCoverage(coverage_floor, option.min_contig, 0.1);
    }

    // Dump the simplified graph's contigs for this k.
    contig_graph.SortVertices();
    contig_graph.GetContigs(raw_contigs, raw_contig_infos);
    WriteSequence(option.graph_file(k), raw_contigs);
    raw_contigs.clear();
    raw_contig_infos.clear();

    if (!option.is_no_coverage)
    {
        double ratio = 0.25;

        // Scratch containers local to this pass. Only the first snapshot is
        // actually consumed (by PrintN50).
        // NOTE(review): the later GetContigs calls look redundant, but they
        // are kept because GetContigs may touch graph state - confirm.
        deque<Sequence> snapshot;
        deque<ContigInfo> snapshot_infos;

        contig_graph.GetContigs(snapshot, snapshot_infos);
        PrintN50(snapshot);

        contig_graph.Trim(10);
        contig_graph.MergeSimilarPath();
        contig_graph.GetContigs(snapshot, snapshot_infos);

        contig_graph.InitializeTable();
        // NOTE(review): the meaning of 1e100 and 1.1 is defined by
        // IterateComponentCoverage2; max_component_size comes from outside
        // this function - confirm both against their declarations.
        contig_graph.IterateComponentCoverage2(option.min_contig, ratio, coverage_floor, 1e100, 1.1, max_component_size);
        contig_graph.GetContigs(snapshot, snapshot_infos);

        contig_graph.Trim(10);
        contig_graph.Prune(k);
        contig_graph.GetContigs(snapshot, snapshot_infos);

        contig_graph.MergeSimilarPath();
    }

    // Final contigs for this k.
    deque<Sequence> final_contigs;
    deque<ContigInfo> final_contig_infos;
    contig_graph.SortVertices();
    contig_graph.GetContigs(final_contigs, final_contig_infos);
    PrintN50(final_contigs);
    WriteSequence(option.contig_file(k), final_contigs);
    WriteContigInfo(option.contig_info_file(k), final_contig_infos);

    // Transcripts: compact the deque in place, keeping only entries whose
    // size reaches the threshold, then cut off the unused tail.
    deque<Sequence> transcripts;
    FindIsoforms(contig_graph, transcripts);

    const int min_transcript_size = 300;
    int kept = 0;
    for (unsigned pos = 0; pos < transcripts.size(); ++pos)
    {
        if (transcripts[pos].size() < min_transcript_size)
            continue;

        transcripts[kept] = transcripts[pos];
        ++kept;
    }
    transcripts.resize(kept);

    PrintN50(transcripts);
    WriteSequence(option.transcript_file(k), transcripts, FormatString("transcript-%d", k));
}