Ejemplo n.º 1
0
/*
 * Exercise path generation on a single-colour graph with kmer-size=11.
 *
 * The fixture sequences seq0..seq3 are first loaded into colour 0. Paths are
 * then added one sequence at a time, checking how many new paths and
 * kmers-with-paths each sequence contributes. Finally the paths stored against
 * each fixture kmer are handed to _check_node_paths() together with the
 * expected path lists.
 */
static void _test_add_paths()
{
  const size_t k = 11, num_cols = 1;
  size_t n;
  dBGraph dbg;

  // Each input sequence, with the number of paths and kmers-with-paths that
  // adding it is expected to contribute
  const struct {
    const char *seq;
    int npaths, nkmers;
  } inputs[] = {
    {seq0, 5, 5}, // path lens: 3+3+2+2+2
    {seq1, 5, 2}, // path lens: 3+3+2+2+2
    {seq2, 3, 2}, // path lens: 1+1+1
    {seq3, 2, 1}  // path lens: 1+1
  };
  const size_t num_inputs = sizeof(inputs) / sizeof(inputs[0]);

  test_status("Testing adding paths in generate_paths.c and gpath_fetch()");

  db_graph_alloc(&dbg, k, num_cols, num_cols, 1024,
                 DBG_ALLOC_NODE_IN_COL | DBG_ALLOC_BKTLOCKS |
                 DBG_ALLOC_COVGS | DBG_ALLOC_EDGES);

  // Path store (set up to track path counts)
  gpath_store_alloc(&dbg.gpstore,
                    dbg.num_of_cols, dbg.ht.capacity,
                    0, ONE_MEGABYTE, true, false);

  // Path hash table, for fast lookup
  gpath_hash_alloc(&dbg.gphash, &dbg.gpstore, ONE_MEGABYTE);

  // Load every sequence into colour 0 before any path is added
  for(n = 0; n < num_inputs; n++)
    build_graph_from_str_mt(&dbg, 0, inputs[n].seq, strlen(inputs[n].seq));

  // Alignment correction parameters used while generating paths
  CorrectAlnParam crt_params = {.ctpcol = 0, .ctxcol = 0,
                                .frag_len_min = 0, .frag_len_max = 0,
                                .one_way_gap_traverse = true,
                                .use_end_check = true,
                                .max_context = 10,
                                .gap_variance = 0.1, .gap_wiggle = 5};

  // Add paths sequence by sequence, checking the counts after each one
  for(n = 0; n < num_inputs; n++) {
    all_tests_add_paths(&dbg, inputs[n].seq, crt_params,
                        inputs[n].npaths, inputs[n].nkmers);
  }

  // Check the path store as a whole (single thread)
  gpath_checks_all_paths(&dbg, 1);

  // Check the paths held against each fixture kmer
  _check_node_paths(kmerA,  kmerApaths,  NPATHS_A,  0, &dbg);
  _check_node_paths(kmerB,  kmerBpaths,  NPATHS_B,  0, &dbg);
  _check_node_paths(kmerAB, kmerABpaths, NPATHS_AB, 0, &dbg);
  _check_node_paths(kmerC,  kmerCpaths,  NPATHS_C,  0, &dbg);
  _check_node_paths(kmerG,  kmerGpaths,  NPATHS_G,  0, &dbg);
  _check_node_paths(kmerF,  kmerFpaths,  NPATHS_F,  0, &dbg);
  _check_node_paths(kmerE,  kmerEpaths,  NPATHS_E,  0, &dbg);
  _check_node_paths(kmerD,  kmerDpaths,  NPATHS_D,  0, &dbg);
  _check_node_paths(kmerDEF,kmerDEFpaths,NPATHS_DEF,0, &dbg);
  _check_node_paths(kmerDE, kmerDEpaths, NPATHS_DE, 0, &dbg);

  db_graph_dealloc(&dbg);
}

// Public entry point for the path tests; currently runs only
// _test_add_paths().
void test_paths()
{
  _test_add_paths();
}
Ejemplo n.º 2
0
/*
 * Set up `graph` for a path test.
 *
 * Allocates the graph with `ncols` colours, allocates its lock / edge /
 * coverage / colour-membership arrays and the path store by hand, loads each
 * of the `nseqs` strings in `seqs` into colour 0, then generates paths from
 * the same strings using `path_params`.
 *
 * `seqs` is never dereferenced when `nseqs` is 0, so NULL is acceptable then.
 * The graph and path store are left allocated on return; only the temporary
 * path worker is released here.
 */
void _construct_graph_with_paths(dBGraph *graph,
                                 size_t kmer_size, size_t ncols,
                                 char **seqs, size_t nseqs,
                                 CorrectAlnParam path_params)
{
  const size_t nworkers = 1;
  size_t s;

  db_graph_alloc(graph, kmer_size, ncols, ncols, 1024);

  // Arrays hanging off the graph: bucket locks, then per-colour edges,
  // coverages and colour-membership bits
  graph->bktlocks
    = ctx_calloc(roundup_bits2bytes(graph->ht.num_of_buckets), 1);
  graph->col_edges
    = ctx_calloc(graph->ht.capacity * ncols, sizeof(Edges));
  graph->col_covgs
    = ctx_calloc(graph->ht.capacity * ncols, sizeof(Covg));
  graph->node_in_cols
    = ctx_calloc(roundup_bits2bytes(graph->ht.capacity) * ncols, 1);

  // Path store plus its per-kmer lock bits
  path_store_alloc(&graph->pstore, 1024, true, graph->ht.capacity, ncols);
  graph->pstore.kmer_locks
    = ctx_calloc(roundup_bits2bytes(graph->ht.capacity), 1);

  // Load all sequences into colour 0
  for(s = 0; s < nseqs; s++) {
    build_graph_from_str_mt(graph, 0, seqs[s], strlen(seqs[s]));
  }

  // Record that at least one colour is in use
  graph->num_of_cols_used = MAX2(graph->num_of_cols_used, 1);

  // Generate paths from the same sequences with a single worker
  GenPathWorker *worker = gen_paths_workers_alloc(nworkers, graph, NULL);

  for(s = 0; s < nseqs; s++) {
    gen_paths_from_str_mt(worker, seqs[s], path_params);
  }

  gen_paths_workers_dealloc(worker, nworkers);
}
Ejemplo n.º 3
0
/*
 * Test supernode_find().
 *
 * Loads NSEQ sequences into colour 0 of a single-colour graph with
 * kmer-size=19, then passes the input sequences and the matching expected
 * strings in `ans` to pull_out_supernodes() for checking.
 */
void test_supernode()
{
  test_status("testing supernode_find()...");

  // Construct 1 colour graph with kmer-size=19
  dBGraph graph;
  size_t kmer_size = 19, ncols = 1;

  db_graph_alloc(&graph, kmer_size, ncols, ncols, 1024,
                 DBG_ALLOC_EDGES | DBG_ALLOC_COVGS | DBG_ALLOC_BKTLOCKS);

  // Number of input sequences, and of expected answers (seq[i] pairs with
  // ans[i]). Undefined again at the end of this function.
  #define NSEQ 7

  const char *seq[NSEQ]
   = {"AGAGAGAGAGAGAGAGAGAGAGAG",
      "AAAAAAAAAAAAAAAAAAAAAAAAAA",
      "ATATATATATATATATATATATATATAT",
      "CGTTCGCGCATGGCCCACG",
      "GAACCAATCGGTCGACTGT",
      "CCCCGCAAAGTCCACTTAGTGTAAGGTACAAATTCTGCAGAGTTGCTGGATCAGCGATAC",
      "TCAATCCGATAGCAACCCGGTCCAA""TCAATCCGATAGCAACCCGGTCCAA"};

  const char *ans[NSEQ]
   = {"AGAGAGAGAGAGAGAGAGAG", // key AGAGAGAGAGAGAGAGAGA < CTCTCTCTCTCTCTCTCTC
      "AAAAAAAAAAAAAAAAAAA",
      "ATATATATATATATATATA",
      "CGTGGGCCATGCGCGAACG",
      "ACAGTCGACCGATTGGTTC",
      "CCCCGCAAAGTCCACTTAGTGTAAGGTACAAATTCTGCAGAGTTGCTGGATCAGCGATAC",
      "AACCCGGTCCAATCAATCCGATAGCAACCCGGTCCAATCAATC"};

  // Load all seq into colour 0
  size_t i;
  for(i = 0; i < NSEQ; i++)
    build_graph_from_str_mt(&graph, 0, seq[i], strlen(seq[i]), false);

  pull_out_supernodes(seq, ans, NSEQ, &graph);

  db_graph_dealloc(&graph);

  // Keep the helper macro from leaking into the rest of the translation unit
  #undef NSEQ
}
Ejemplo n.º 4
0
/*
 * Add paths to `graph` from each of the `nseqs` strings in `seqs`, using a
 * single path-generation worker, then check how the path store grew.
 *
 * params:     alignment correction parameters used for every sequence
 * exp_npaths: expected increase in graph->gpstore.num_paths; pass a negative
 *             value to skip this check
 * exp_nkmers: expected increase in graph->gpstore.num_kmers_with_paths; pass a
 *             negative value to skip this check
 *
 * Each sequence is fed through gen_paths_worker_seq() as read 1 of an
 * AsyncIOData record whose read 2 has been reset.
 */
void all_tests_add_paths_multi(dBGraph *graph, const char **seqs, size_t nseqs,
                               CorrectAlnParam params,
                               int exp_npaths, int exp_nkmers)
{
  // Counts before adding anything, so the checks below measure the delta
  size_t npaths = graph->gpstore.num_paths;
  size_t nkmers = graph->gpstore.num_kmers_with_paths;

  size_t i, nworkers = 1;
  GenPathWorker *wrkrs = gen_paths_workers_alloc(nworkers, graph);

  // Set up asyncio input data
  AsyncIOInput io = {.file1 = NULL, .file2 = NULL,
                     .fq_offset = 0, .interleaved = false};

  CorrectAlnInput task = {.files = io, .fq_cutoff = 0, .hp_cutoff = 0,
                          .matedir = READPAIR_FR, .crt_params = params,
                          .out_base = NULL, .output = NULL};

  AsyncIOData iodata;
  asynciodata_alloc(&iodata);
  seq_read_reset(&iodata.r2);
  iodata.fq_offset1 = iodata.fq_offset2 = 0;
  iodata.ptr = NULL;

  // Add paths
  for(i = 0; i < nseqs; i++) {
    seq_read_set(&iodata.r1, seqs[i]);
    gen_paths_worker_seq(wrkrs, &iodata, &task);
  }

  asynciodata_dealloc(&iodata);
  gen_paths_workers_dealloc(wrkrs, nworkers);

  // Check we added the right number of paths
  if(exp_npaths >= 0) {
    TASSERT2(graph->gpstore.num_paths == npaths + (size_t)exp_npaths, "%zu %zu %zu",
             (size_t)graph->gpstore.num_paths, (size_t)npaths, (size_t)exp_npaths);
  }

  // Check the right number of kmers gained paths; report the observed,
  // previous and expected values on failure, as the path check above does
  if(exp_nkmers >= 0) {
    TASSERT2(graph->gpstore.num_kmers_with_paths == nkmers + (size_t)exp_nkmers,
             "%zu %zu %zu",
             (size_t)graph->gpstore.num_kmers_with_paths,
             (size_t)nkmers, (size_t)exp_nkmers);
  }
}

// Single-sequence convenience wrapper: adds paths from `seq` alone via
// all_tests_add_paths_multi(), forwarding the expected path / kmer counts.
void all_tests_add_paths(dBGraph *graph, const char *seq,
                         CorrectAlnParam params,
                         int exp_npaths, int exp_nkmers)
{
  const char *single[1] = { seq };

  all_tests_add_paths_multi(graph, single, 1, params,
                            exp_npaths, exp_nkmers);
}

/*
 * Allocate `graph` together with its path store and path hash table, load
 * each of the `nseqs` strings in `seqs` into colour 0, then add paths from
 * the same strings using `path_params`.
 *
 * No path/kmer count checks are requested when the paths are added (both
 * expected counts are passed as -1). The graph is left allocated on return.
 */
void all_tests_construct_graph(dBGraph *graph,
                               size_t kmer_size, size_t ncols,
                               const char **seqs, size_t nseqs,
                               CorrectAlnParam path_params)
{
  size_t s;

  db_graph_alloc(graph, kmer_size, ncols, ncols, 1024,
                 DBG_ALLOC_BKTLOCKS | DBG_ALLOC_NODE_IN_COL |
                 DBG_ALLOC_COVGS | DBG_ALLOC_EDGES);

  // Path store
  gpath_store_alloc(&graph->gpstore, ncols, graph->ht.capacity,
                    0, ONE_MEGABYTE, true, false);

  // Split the store for reading/writing: links must not be used while
  // adding new links
  gpath_store_split_read_write(&graph->gpstore);

  // Path hash table, allocated just in case it is needed
  gpath_hash_alloc(&graph->gphash, &graph->gpstore, ONE_MEGABYTE);

  // Load every sequence into colour 0
  for(s = 0; s < nseqs; s++) {
    const char *str = seqs[s];
    build_graph_from_str_mt(graph, 0, str, strlen(str), false);
  }

  gpath_store_merge_read_write(&graph->gpstore);

  // Record that at least one colour is in use
  graph->num_of_cols_used = MAX2(graph->num_of_cols_used, 1);

  // Add paths from the same sequences, without checking counts
  all_tests_add_paths_multi(graph, seqs, nseqs, path_params, -1, -1);
}
Ejemplo n.º 5
0
/*
 * Test the graph crawler on a three-colour graph with kmer-size=11.
 *
 * Three sequences are loaded, one per colour. Starting from the node found
 * for the start of the colour-0 sequence, the crawler is asked to follow the
 * edge leading to the node found one base further along. The test expects two
 * paths, each spelling out one of the input sequences minus its first base.
 */
void test_graph_crawler()
{
  test_status("Testing graph crawler...");

  // Three-colour graph with kmer-size=11
  const size_t kmer_size = 11, ncols = 3;
  dBGraph graph;

  db_graph_alloc(&graph, kmer_size, ncols, 1, 2048,
                 DBG_ALLOC_BKTLOCKS | DBG_ALLOC_NODE_IN_COL | DBG_ALLOC_EDGES);

  char graphseq[3][77] =
//           <               X                 X              X...............
{"GTTCCAGAGCGGAGGTCTCCCAACAACATGGTATAAGTTGTCTAGCCCCGGTTCGCGCGGGTACTTCTTACAGCGC",
 "GTTCCAGAGCGGAGGTCTCCCAACAACTTGGTATAAGTTGTCTAGTCCCGGTTCGCGCGGCATTTCAGCATTGTTA",
 "GTTCCAGAGCGCGACAGAGTGCATATCACGCTAAGCACAGCCCTCTTCTATCTGCTTTTAAATGGATCAATAATCG"};

  // graphseq[c] goes into colour c
  size_t col;
  for(col = 0; col < ncols; col++) {
    build_graph_from_str_mt(&graph, col, graphseq[col], strlen(graphseq[col]));
  }

  GraphCrawler crawler;
  graph_crawler_alloc(&crawler, &graph);

  // Nodes looked up at offsets 0 and 1 of the colour-0 sequence
  dBNode node = db_graph_find_str(&graph, graphseq[0]);
  dBNode next_node = db_graph_find_str(&graph, graphseq[0]+1);
  TASSERT(node.key != HASH_NOT_FOUND);
  TASSERT(next_node.key != HASH_NOT_FOUND);

  BinaryKmer start_bkmer = db_node_get_bkmer(&graph, node.key);
  Edges start_edges = db_node_get_edges(&graph, node.key, 0);

  // Nodes reachable from the start node
  dBNode next_nodes[4];
  Nucleotide nucs[4];
  size_t num_next = db_graph_next_nodes(&graph, start_bkmer, node.orient,
                                        start_edges, next_nodes, nucs);

  // Locate next_node among them
  size_t next_idx;
  for(next_idx = 0; next_idx < num_next; next_idx++) {
    if(db_nodes_are_equal(next_nodes[next_idx],next_node)) break;
  }

  TASSERT(next_idx < num_next && db_nodes_are_equal(next_nodes[next_idx],next_node));

  // Crawl in all colours
  graph_crawler_fetch(&crawler, node, next_nodes, next_idx, num_next,
                      NULL, graph.num_of_cols, NULL, NULL, NULL);

  TASSERT2(crawler.num_paths == 2, "crawler.num_paths: %u", crawler.num_paths);

  // Buffers for the nodes and the string of each crawled path
  dBNodeBuffer nbuf;
  StrBuf sbuf;
  db_node_buf_alloc(&nbuf, 16);
  strbuf_alloc(&sbuf, 128);

  size_t path_idx, i;
  for(path_idx = 0; path_idx < crawler.num_paths; path_idx++)
  {
    db_node_buf_reset(&nbuf);
    graph_crawler_get_path_nodes(&crawler, path_idx, &nbuf);
    strbuf_ensure_capacity(&sbuf, nbuf.len+graph.kmer_size);
    sbuf.end = db_nodes_to_str(nbuf.b, nbuf.len, &graph, sbuf.b);

    // The path must equal one of the inputs with its first base dropped
    i = 0;
    while(i < 3 && strcmp(graphseq[i]+1,sbuf.b) != 0) i++;

    TASSERT2(i < 3, "seq: %s", sbuf.b);
    TASSERT2(sbuf.end == 75, "sbuf.end: %zu", sbuf.end);
    TASSERT2(nbuf.len == 65, "nbuf.len: %zu", nbuf.len);
  }

  strbuf_dealloc(&sbuf);
  db_node_buf_dealloc(&nbuf);

  graph_crawler_dealloc(&crawler);

  db_graph_dealloc(&graph);
}
Ejemplo n.º 6
0
/*
 * Test walking through a tandem repeat with the repeat walker.
 *
 * The graph is built from a sequence made of a 15-base start, eleven copies
 * of a 12-base unit and a 15-base end. Starting at the first kmer of that
 * sequence, test_walk() is run three times against progressively more path
 * information:
 *   1) no paths,
 *   2) two short paths (p0, p1) that each cover two copies of the repeat,
 *   3) a path generated from the full sequence.
 */
static void test_repeat_loop()
{
  // Fails if the size of the FollowPath struct is anything other than 20 bytes
  TASSERT(sizeof(FollowPath) == 20);

  // Construct 1 colour graph with kmer-size=11
  dBGraph graph;
  size_t kmer_size = 11, ncols = 1;

  // Set up alignment correction params
  CorrectAlnParam params = {.ctpcol = 0, .ctxcol = 0,
                            .ins_gap_min = 0, .ins_gap_max = 0,
                            .one_way_gap_traverse = true, .use_end_check = true,
                            .max_context = 10,
                            .gap_variance = 0.1, .gap_wiggle = 5};

  // Sequence with repeat: 15-base start, 11 x 12-base unit, 15-base end
  char seq[] = "ATTTGGAACTCCGGA"
               "GATAGGGCCAGT"
               "GATAGGGCCAGT"
               "GATAGGGCCAGT"
               "GATAGGGCCAGT"
               "GATAGGGCCAGT"
               "GATAGGGCCAGT"
               "GATAGGGCCAGT"
               "GATAGGGCCAGT"
               "GATAGGGCCAGT"
               "GATAGGGCCAGT"
               "GATAGGGCCAGT"
               "CGTCAGGAGCTAACT";

  // Short paths: start + two repeat units, and two repeat units + end
  char p0[] = "ATTTGGAACTCCGGA""GATAGGGCCAGT""GATAGGGCCAGT";
  char p1[] = "GATAGGGCCAGT""GATAGGGCCAGT""CGTCAGGAGCTAACT";

  // Allocate graph, but don't add any sequence
  _construct_graph_with_paths(&graph, kmer_size, ncols, NULL, 0, params);

  GenPathWorker *gen_path_wrkr = gen_paths_workers_alloc(1, &graph, NULL);

  GraphWalker gwlk;
  RepeatWalker rptwlk;
  graph_walker_alloc(&gwlk);
  rpt_walker_alloc(&rptwlk, graph.ht.capacity, 12);

  dBNodeBuffer nbuf;
  db_node_buf_alloc(&nbuf, 1024);

  // Construct graph but no paths
  build_graph_from_str_mt(&graph, 0, seq, strlen(seq));
  // Expect one kmer per base of the start, of one repeat unit and of the end
  TASSERT2(graph.ht.num_kmers == 15+12+15, "%zu", (size_t)graph.ht.num_kmers);

  // Find first node in sequence
  dBNode node0 = db_graph_find_str(&graph, seq);
  TASSERT(node0.key != HASH_NOT_FOUND);

  // 1) With no paths
  // ans0 is 27 bases long, i.e. 15+2 = 17 kmers at kmer-size 11
  char ans0[] = "ATTTGGAACTCCGGA""GATAGGGCCAGT";
  test_walk(&gwlk, &rptwlk, node0, &nbuf, &graph, 15+2, ans0);

  // 2) Add small paths - produces collapsed down seq with two copy repeat
  // ans1 is 54 bases long, i.e. 15+12+12+5 = 44 kmers at kmer-size 11
  gen_paths_from_str_mt(gen_path_wrkr, p0, params);
  gen_paths_from_str_mt(gen_path_wrkr, p1, params);
  char ans1[] = "ATTTGGAACTCCGGA""GATAGGGCCAGT""GATAGGGCCAGT""CGTCAGGAGCTAACT";
  test_walk(&gwlk, &rptwlk, node0, &nbuf, &graph, 15+12+12+5, ans1);

  // 3) Add long paths
  // The full sequence is now the expected walk: strlen(seq)+1-kmer_size kmers
  gen_paths_from_str_mt(gen_path_wrkr, seq, params);
  test_walk(&gwlk, &rptwlk, node0, &nbuf, &graph, strlen(seq)+1-kmer_size, seq);

  graph_walker_dealloc(&gwlk);
  rpt_walker_dealloc(&rptwlk);
  db_node_buf_dealloc(&nbuf);
  gen_paths_workers_dealloc(gen_path_wrkr, 1);
  db_graph_dealloc(&graph);
}

// Public entry point for the repeat walker tests: prints the status line,
// then runs test_repeat_loop().
void test_repeat_walker()
{
  test_status("Testing repeat_walker.h");
  test_repeat_loop();
}
Ejemplo n.º 7
0
// Reset `graph` and load each of the `nseqs` sequences into its own colour
// (seqs[i] goes into colour i), then create one bubble caller and pass the
// 5'/3' flanks and the `nalleles` alleles to _call_bubble() along with it.
// The graph itself is not freed here.
static void test_bubbles(dBGraph *graph, const char **seqs, size_t nseqs,
                         const char *flank5p, const char *flank3p,
                         const char **alleles, size_t nalleles)
{
  const size_t num_callers = 1;
  size_t col;

  db_graph_reset(graph);

  // Need at least one colour per sequence
  TASSERT(graph->num_of_cols >= nseqs);

  for(col = 0; col < nseqs; col++) {
    build_graph_from_str_mt(graph, col, seqs[col], strlen(seqs[col]), false);
  }

  // Record that at least one colour is in use
  graph->num_of_cols_used = MAX2(graph->num_of_cols_used, 1);

  // Scratch buffers passed through to _call_bubble()
  StrBuf seq_buf;
  dBNodeBuffer node_buf;
  strbuf_alloc(&seq_buf, 128);
  db_node_buf_alloc(&node_buf, 128);

  BubbleCallingPrefs call_prefs = {.max_allele_len = 100,
                                   .max_flank_len = 100,
                                   .haploid_cols = NULL,
                                   .nhaploid_cols = 0,
                                   .remove_serial_bubbles = true};

  BubbleCaller *callers
    = bubble_callers_new(num_callers, &call_prefs, NULL, graph);

  _call_bubble(callers, flank5p, flank3p, alleles, nalleles,
               &node_buf, &seq_buf);

  strbuf_dealloc(&seq_buf);
  db_node_buf_dealloc(&node_buf);
  bubble_callers_destroy(callers, num_callers);
}

/*
 * Test bubble calling on small hand-built examples.
 *
 * One graph is allocated and reused for every case; test_bubbles() resets it,
 * loads the given sequences (one per colour) and checks the bubble described
 * by the flanks and alleles. The cases are:
 *   - two sequences differing at a single base,
 *   - three sequences carrying two separate single-base differences (x, y),
 *     each checked in the forward direction and again using the reverse
 *     complement flanks/alleles.
 */
void test_bubble_caller()
{
  test_status("Testing bubble calling...");

  // Construct 3 colour graph with kmer-size=11
  dBGraph graph;
  const size_t kmer_size = 11, ncols = 3;

  // Create graph
  db_graph_alloc(&graph, kmer_size, ncols, 1, 2000,
                 DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL | DBG_ALLOC_BKTLOCKS);

  // Case 0: the two sequences differ only at the base between the flanks
  // (each sequence is flank5p0 + one base + flank3p0)
  //   mutations:                      x
  const char *seqs0[] = {"AGGGATAAAACTCTGTACTGGATCTCCCT",
                         "AGGGATAAAACTCTcTACTGGATCTCCCT"};
  const char flank5p0[] = "AGGGATAAAACTCT";
  const char flank3p0[] = "TACTGGATCTCCCT";
  const char *alleles0[] = {"ATAAAACTCTGTACTGGATCT", "ATAAAACTCTcTACTGGATCT"};

  test_bubbles(&graph, seqs0, 2, flank5p0, flank3p0, alleles0, 2);

  // Case 1: the second sequence carries mutation x, the third mutation y
  //   mutations:                     x                  y
  const char *seqs1[] = {"CCCGTAGGTAAGGGCGTTAGTGCAAGGCCACATTGGGACACGAGTTGATA",
                         "CCCGTAGGTAAGtGCGTTAGTGCAAGGCCACATTGGGACACGAGTTGATA",
                         "CCCGTAGGTAAGGGCGTTAGTGCAAGGCCACtTTGGGACACGAGTTGATA"};

  // forwards
  // 1a: bubble around mutation x
  const char flank5p1a[] = "CCCGTAGGTAAG";
  const char flank3p1a[] = "GCGTTAGTGCAAGGCCAC";
  const char *alleles1a[] = {"CGTAGGTAAGGGCGTTAGTGC", "CGTAGGTAAGtGCGTTAGTGC"};

  // 1b: bubble around mutation y; its 5' flank is the 3' flank of 1a
  const char flank5p1b[] = "GCGTTAGTGCAAGGCCAC";
  const char flank3p1b[] = "TTGGGACACGAGTTGATA";
  const char *alleles1b[] = {"GCAAGGCCACATTGGGACACG", "GCAAGGCCACtTTGGGACACG"};

  test_bubbles(&graph, seqs1, 3, flank5p1a, flank3p1a, alleles1a, 2);
  test_bubbles(&graph, seqs1, 3, flank5p1b, flank3p1b, alleles1b, 2);

  // reverse
  // The same three sequences, reverse complemented (shown for reference):
  // mutations:        y                  x
  // TATCAACTCGTGTCCCAATGTGGCCTTGCACTAACGCCCTTACCTACGGG
  // TATCAACTCGTGTCCCAATGTGGCCTTGCACTAACGCaCTTACCTACGGG
  // TATCAACTCGTGTCCCAAaGTGGCCTTGCACTAACGCCCTTACCTACGGG
  //
  // 1c: bubble around mutation x on the reverse strand
  const char flank5p1c[] = "GTGGCCTTGCACTAACGC";
  const char flank3p1c[] = "CTTACCTACGGG";
  const char *alleles1c[] = {"GCACTAACGCCCTTACCTACG", "GCACTAACGCaCTTACCTACG"};

  // 1d: bubble around mutation y on the reverse strand
  const char flank5p1d[] = "TATCAACTCGTGTCCCAA";
  const char flank3p1d[] = "GTGGCCTTGCACTAACGC";
  const char *alleles1d[] = {"CGTGTCCCAATGTGGCCTTGC", "CGTGTCCCAAaGTGGCCTTGC"};

  test_bubbles(&graph, seqs1, 3, flank5p1c, flank3p1c, alleles1c, 2);
  test_bubbles(&graph, seqs1, 3, flank5p1d, flank3p1d, alleles1d, 2);

  db_graph_dealloc(&graph);
}