Esempio n. 1
0
int64_t InsertIterativeKmers(const HashGraph &old_hash_graph, const Sequence &seq, HashGraph &hash_graph, int kmer_count)
{
    int old_kmer_size = old_hash_graph.kmer_size();
    int new_kmer_size = hash_graph.kmer_size();

    Kmer old_kmer(old_kmer_size);
    Kmer new_kmer(new_kmer_size);
    int length = 0;
    int count = 0;
    int num_iterative_kmers = 0;
    for (uint32_t j = 0; j < seq.size(); ++j)
    {
        old_kmer.ShiftAppend(seq[j]);
        new_kmer.ShiftAppend(seq[j]);

        length = (seq[j] < 4) ? length + 1 : 0;

        count = (length >= old_kmer_size && old_hash_graph.FindVertex(old_kmer) != NULL) ? count+1 : 0;
        if (count >= new_kmer_size - old_kmer_size + 1)
        {
            ++num_iterative_kmers;
            HashGraphVertex *vertex = hash_graph.InsertVertex(new_kmer, kmer_count);
            HashGraphVertexAdaptor adaptor(vertex, new_kmer != vertex->kmer());
            if (length > new_kmer_size && seq[j-new_kmer_size] < 4)
                adaptor.in_edges().Add(3 - seq[j-new_kmer_size]);
            if (j+1 < seq.size() && seq[j+1] < 4)
                adaptor.out_edges().Add(seq[j+1]);
        }
    }

    return num_iterative_kmers;
}
Esempio n. 2
0
File: idba.cpp Progetto: binma/idba
// Assemble the k-mers in `hash_graph` into contigs and write the results.
//
// Steps, in order:
//   1. Assemble the hash graph into contigs, then empty it and drop its storage.
//   2. Build a ContigGraph and simplify it (dead ends, bubbles, similar paths,
//      and, unless option.is_no_coverage is set, locally low-coverage contigs).
//   3. Write the simplified graph to option.graph_file(k).
//   4. Unless option.is_no_coverage is set, run the iterative local-coverage
//      filter and merge similar paths again.
//   5. Print N50 and write the final contigs to option.contig_file(k).
//
// `hash_graph` is left empty on return.
void Assemble(HashGraph &hash_graph)
{
    cout << "kmers " << hash_graph.num_vertices() << " "<< hash_graph.num_edges() << endl;

    const int k = hash_graph.kmer_size();
    // option.min_count applies when k equals option.mink, option.min_support
    // otherwise; the threshold never goes below 1.
    const double min_cover = max(1, (k != option.mink ? option.min_support : option.min_count));

    // The mean coverage has to be read before the graph is cleared below.
    Histgram<int> coverage_hist = hash_graph.coverage_histgram();
    const double expected_coverage = coverage_hist.mean();

    deque<Sequence> sequences;
    deque<ContigInfo> infos;
    hash_graph.Assemble(sequences, infos);
    hash_graph.clear();

    // Swap with an empty temporary; the temporary takes the old contents with
    // it when it is destroyed at the end of the statement.
    HashGraph().swap(hash_graph);

    ContigGraph contig_graph(k, sequences, infos);
    sequences.clear();
    infos.clear();

    contig_graph.RemoveDeadEnd(option.min_contig);
    const int bubble = contig_graph.RemoveBubble();
    cout << "merge bubble " << bubble << endl;

    contig_graph.MergeSimilarPath();

    const bool use_coverage = !option.is_no_coverage;
    if (use_coverage)
        contig_graph.RemoveLocalLowCoverage(min_cover, option.min_contig, 0.1);

    contig_graph.SortVertices();
    contig_graph.GetContigs(sequences, infos);
    WriteSequence(option.graph_file(k), sequences);
    sequences.clear();
    infos.clear();

    if (use_coverage)
    {
        // Base ratio is 0.5 below option.maxk and 0.2 otherwise, raised to
        // 2 / expected_coverage when that value is larger.
        const double base_ratio = (k < option.maxk) ? 0.5 : 0.2;
        const double coverage_floor = 2.0 / expected_coverage;
        const double ratio = (base_ratio < coverage_floor) ? coverage_floor : base_ratio;

        contig_graph.IterateLocalCoverage(option.min_contig, ratio, min_cover, 1e100, 1.1);
        contig_graph.MergeSimilarPath();
    }

    deque<Sequence> final_contigs;
    deque<ContigInfo> final_contig_infos;
    contig_graph.SortVertices();
    contig_graph.GetContigs(final_contigs, final_contig_infos);
    PrintN50(final_contigs);
    WriteContig(option.contig_file(k), final_contigs, final_contig_infos, FormatString("contig-%d", k));
}
Esempio n. 3
0
// Rebuild `hash_graph` at a larger k-mer size.
//
// assembly_info  supplies reads, long_reads, their per-read flag vectors and
//                ref_contigs; only flagged reads are used.
// new_kmer_size  k-mer size the graph is switched to.
// min_support    passed to HashGraph::RefreshVertices after the reads have
//                been inserted.
// hash_graph     on entry a graph at the old k-mer size; on return it has been
//                cleared and refilled at new_kmer_size.
// old_contigs    contigs from the previous round; inserted into the graph both
//                before and after the k-mer size change.
void IterateHashGraph(AssemblyInfo &assembly_info, int new_kmer_size, int min_support, HashGraph &hash_graph, deque<Sequence> &old_contigs)
{
    int old_kmer_size = hash_graph.kmer_size();
    deque<ShortSequence> &reads = assembly_info.reads;
    deque<Sequence> &long_reads = assembly_info.long_reads;
    vector<bool> &read_flags = assembly_info.read_flags;
    vector<bool> &long_read_flags = assembly_info.long_read_flags;

    // Phase 1: add the previous contigs to the old-size graph, connect it, and
    // assemble it into `contigs`. The graph is emptied afterwards.
#pragma omp parallel for schedule(static, 1)
    for (int64_t i = 0; i < (int64_t)old_contigs.size(); ++i)
        hash_graph.InsertUncountKmers(old_contigs[i]);
    hash_graph.AddAllEdges();

    deque<Sequence> contigs;
    hash_graph.Assemble(contigs);
    hash_graph.clear();

    // Phase 2: size the lookup graph. Only the first and last new_kmer_size
    // symbols of each contig are inserted below, and each such end holds d + 1
    // old-size k-mers, so a contig contributes at most 2*d + 2 k-mers (fewer
    // when it is too short, none when it is shorter than old_kmer_size).
    uint64_t sum = 0;
    int d = new_kmer_size - old_kmer_size;
    for (unsigned i = 0; i < contigs.size(); ++i)
    {
        if ((int)contigs[i].size() - old_kmer_size + 1 >= 2*d + 2)
            sum += 2*d + 2;
        else if ((int)contigs[i].size() >= old_kmer_size)
            sum += contigs[i].size() - old_kmer_size + 1;
    }

    HashGraph old_hash_graph(old_kmer_size);
    old_hash_graph.reserve(sum);
#pragma omp parallel for schedule(static, 1)
    for (int64_t i = 0; i < (int64_t)contigs.size(); ++i)
    {
        // Prefix of the contig, at most new_kmer_size symbols long.
        Sequence seq;
        seq.Assign(contigs[i], 0, min(new_kmer_size, (int)contigs[i].size()));
        old_hash_graph.InsertKmers(seq);

        // Suffix of the contig, at most new_kmer_size symbols long.
        seq.Assign(contigs[i], max(0, (int)contigs[i].size() - new_kmer_size), min(new_kmer_size, (int)contigs[i].size()));
        old_hash_graph.InsertKmers(seq);
    }
    //cout << "old kmer " << old_hash_graph.num_vertices() << endl;

    // Phase 3: switch to the new k-mer size and insert k-mers from the flagged
    // reads, flagged long reads and all reference contigs, using
    // old_hash_graph as the lookup graph.
    // NOTE(review): InsertIterativeKmers is called with three arguments here;
    // this relies on a default or overload declared outside this function.
    hash_graph.set_kmer_size(new_kmer_size);
#pragma omp parallel for
    for (int64_t i = 0; i < (int64_t)reads.size(); ++i)
    {
        if (!read_flags[i])
            continue;

        Sequence seq(reads[i]);
        InsertIterativeKmers(old_hash_graph, seq, hash_graph);
    }

#pragma omp parallel for schedule(static, 1)
    for (int64_t i = 0; i < (int64_t)long_reads.size(); ++i)
    {
        if (!long_read_flags[i])
            continue;

        InsertIterativeKmers(old_hash_graph, long_reads[i], hash_graph);
    }
    
#pragma omp parallel for schedule(static, 1)
    for (int64_t i = 0; i < (int64_t)assembly_info.ref_contigs.size(); ++i)
        InsertIterativeKmers(old_hash_graph, assembly_info.ref_contigs[i], hash_graph);

    // The lookup graph is no longer needed: clear it, then swap it with an
    // empty temporary that is destroyed at the end of the scope (presumably to
    // release its storage right away -- confirm against HashGraph::clear).
    old_hash_graph.clear();
    {
        HashGraph tmp_hash_graph;
        tmp_hash_graph.swap(old_hash_graph);
    }

    // Presumably drops vertices with fewer than min_support occurrences --
    // confirm against HashGraph::RefreshVertices.
    hash_graph.RefreshVertices(min_support);

    // Phase 4: add the previous contigs at the new k-mer size, reset the
    // counts, then hand over to InsertExistKmers.
#pragma omp parallel for schedule(static, 1)
    for (int64_t i = 0; i < (int64_t)old_contigs.size(); ++i)
        hash_graph.InsertUncountKmers(old_contigs[i]);
    hash_graph.ClearCount();

    InsertExistKmers(assembly_info, hash_graph);
}
Esempio n. 4
0
// Assemble the k-mers in `hash_graph` into contigs and transcripts and write
// the results.
//
// Steps, in order:
//   1. Assemble the hash graph into contigs, then empty it and drop its storage.
//   2. Build a ContigGraph; unless option.is_no_coverage is set, remove
//      stand-alone contigs, bubbles and locally low-coverage contigs.
//   3. Write the graph to option.graph_file(k).
//   4. Unless option.is_no_coverage is set, trim, merge similar paths, run the
//      component-coverage filter, trim and prune again, and merge once more.
//   5. Print N50 and write contigs and contig infos.
//   6. Extract isoforms, keep those of length >= 300, print N50 and write them
//      to option.transcript_file(k).
//
// `hash_graph` is left empty on return.
void Assemble(HashGraph &hash_graph)
{
    cout << "kmers " << hash_graph.num_vertices() << " "<< hash_graph.num_edges() << endl;

    const int k = hash_graph.kmer_size();
    // option.min_count applies when k equals option.mink, option.min_support
    // otherwise; the threshold never goes below 1.
    const double min_cover = max(1, (k != option.mink ? option.min_support : option.min_count));

    // Fetched but not used further in this function.
    Histgram<int> coverage_hist = hash_graph.coverage_histgram();

    deque<Sequence> sequences;
    deque<ContigInfo> infos;
    hash_graph.Assemble(sequences, infos);
    hash_graph.clear();

    {
        // Swap with an empty graph; it takes the old contents with it when it
        // goes out of scope.
        HashGraph drained;
        drained.swap(hash_graph);
    }

    ContigGraph contig_graph(k, sequences, infos);
    sequences.clear();
    infos.clear();

    const bool use_coverage = !option.is_no_coverage;

    if (use_coverage)
    {
        contig_graph.RemoveStandAlone(k);

        const int bubble = contig_graph.RemoveBubble();
        cout << "merge bubble " << bubble << endl;

        contig_graph.RemoveLocalLowCoverage(min_cover, option.min_contig, 0.1);
    }

    contig_graph.SortVertices();
    contig_graph.GetContigs(sequences, infos);
    WriteSequence(option.graph_file(k), sequences);
    sequences.clear();
    infos.clear();

    if (use_coverage)
    {
        const double ratio = 0.25;

        // GetContigs is called after every stage, as before; only the first
        // result is reported through PrintN50.
        deque<Sequence> stage_contigs;
        deque<ContigInfo> stage_infos;
        contig_graph.GetContigs(stage_contigs, stage_infos);
        PrintN50(stage_contigs);

        contig_graph.Trim(10);
        contig_graph.MergeSimilarPath();
        contig_graph.GetContigs(stage_contigs, stage_infos);

        contig_graph.InitializeTable();
        contig_graph.IterateComponentCoverage2(option.min_contig, ratio, min_cover, 1e100, 1.1, max_component_size);
        contig_graph.GetContigs(stage_contigs, stage_infos);

        contig_graph.Trim(10);
        contig_graph.Prune(k);
        contig_graph.GetContigs(stage_contigs, stage_infos);

        contig_graph.MergeSimilarPath();
    }

    deque<Sequence> final_contigs;
    deque<ContigInfo> final_contig_infos;
    contig_graph.SortVertices();
    contig_graph.GetContigs(final_contigs, final_contig_infos);
    PrintN50(final_contigs);
    WriteSequence(option.contig_file(k), final_contigs);
    WriteContigInfo(option.contig_info_file(k), final_contig_infos);

    deque<Sequence> transcripts;

    FindIsoforms(contig_graph, transcripts);

    // Compact in place: move every transcript of length >= 300 to the front,
    // keeping relative order, then cut off the tail.
    int kept = 0;
    for (unsigned i = 0; i < transcripts.size(); ++i)
    {
        if (!(transcripts[i].size() >= 300))
            continue;

        transcripts[kept] = transcripts[i];
        ++kept;
    }
    transcripts.resize(kept);

    PrintN50(transcripts);
    WriteSequence(option.transcript_file(k), transcripts, FormatString("transcript-%d", k));
}