示例#1
0
int main(int argc, char **argv)
{
  init_rand();
  setup_gsl_dgen();

  dgen_parse_cmdline(argc, argv);

  int i, j, h, p, k, c;
  
  int cons_cap, num_cons = -1, root_cap, num_root = -1;
  char **cons_seqs, **root_seqs;

  int internal_node_index = 0, leaf_node_index = M;
  int max_internal_node_index = 2000000;
  int max_leaf_node_index = 2000000;
  int sum_seen_or_unseen = 0;

  int ploidy, codon_sequence_length = -1, n_genes, total_n_HLA;
  Decimal mu;

  if(json_parameters_path == NULL) die("No .json file passed.");

  load_lengths_for_simulation_from_json(json_parameters_path, &kappa, &mu,
                                        &codon_sequence_length, &total_n_HLA, &ploidy, &n_genes);

  if(ploidy < 1) die("Ploidy is less than 1.");
  if(n_genes < 1) die("Number of genes is less than 1.");
  
  if(cons_path != NULL) {
    printf("Loading sequences to obtain consensus...\n");
    num_cons = load_seqs(cons_path, &cons_seqs, &cons_cap);
    assert(num_cons > 0);
    printf("Loaded %i sequences to determine consensus.\n", num_cons);
    
    codon_sequence_length = strlen(cons_seqs[0]);
    printf("Codon_sequence_length: %i\n",codon_sequence_length/3);
    
    if(codon_sequence_length % 3 != 0)
      die("Sequences contain partial codons [%i mod 3 != 0].", codon_sequence_length);

    for(c = 0; c < num_cons; c++) {
      if((int) strlen(cons_seqs[c]) != codon_sequence_length) {
        die("Sequences from which to derive the consensus sequence aren't all "
            "the same length.");
      }
    }
    codon_sequence_length = codon_sequence_length/3;
  }

  if(root_path != NULL) {
    printf("Loading sequences to obtain root...\n");
    num_root = load_seqs(root_path, &root_seqs, &root_cap);
    printf("Loaded %i sequences to determine root.\n", num_root);
    
    if(cons_path == NULL)
      die("Did not pass a file to find the consensus sequence.");
    
    if((int) (strlen(root_seqs[0])/3) != codon_sequence_length)
      die("Sequences used to determine the root are different lengths to those used for the consensus.");

    for(c = 0; c < num_root; c++) {
      if((int) strlen(root_seqs[c]) != 3*codon_sequence_length) {
        die("Sequences from which to derive the root sequence aren't all "
            "the same length.");
      }
    }
  }

  Decimal *internal_node_times = my_malloc(max_internal_node_index * sizeof(Decimal) , __FILE__, __LINE__);
  Decimal *leaf_node_times = my_malloc(max_leaf_node_index * sizeof(Decimal), __FILE__, __LINE__);
  int *seen_or_unseen = my_malloc(max_internal_node_index * sizeof(int), __FILE__, __LINE__);

  birth_death_simulation_backwards(max_internal_node_index, max_leaf_node_index,
                                   internal_node_times, 
                                   leaf_node_times,
                                   &internal_node_index, &leaf_node_index,
                                   seen_or_unseen,
                                   N, M, lambda, mu_tree, past_sampling);

  for(i = 0; i < internal_node_index; i++) sum_seen_or_unseen += seen_or_unseen[i];
  
  int total_nodes = (2 * leaf_node_index) - 1 + internal_node_index - sum_seen_or_unseen;
  Tree *tree = my_malloc((total_nodes+1) * sizeof(Tree), __FILE__, __LINE__);
  // Now malloc the memory that this points to.
  int *HLAs_in_tree = my_malloc((total_nodes+1) * ploidy * n_genes * sizeof(int), __FILE__, __LINE__);

  for(i = 0; i < total_nodes; i++) 
    tree[i].HLAs = &HLAs_in_tree[i * ploidy * n_genes];

  construct_birth_death_tree(leaf_node_index, internal_node_index,
                             leaf_node_times, internal_node_times,
                             M, seen_or_unseen,
                             tree);

  // Reverse the direction that time is measured in the tree.
  // DEV: Don't need to do this, waste of computation - sort.
  // DEV: The parent times are wrong when there are unseen nodes.
  for(i = 0; i < total_nodes; i++)
    tree[i].node_time = tree[total_nodes-1].node_time - tree[i].node_time;
   
  int root_node = tree[total_nodes-1].node;
  
  if(write_newick_tree_to_file == true) {
    write_newick_tree(newick_tree_data_file, tree, root_node, 1);
    fclose(newick_tree_data_file);
  }

  Decimal S_portion[NUM_CODONS];
  Decimal NS_portion[NUM_CODONS];

  for(c = 0; c < NUM_CODONS; c++) {
    S_portion[c] = kappa * beta_S[c] + beta_V[c];
    NS_portion[c] = kappa * alpha_S[c] + alpha_V[c];
  }
  
  int n_HLA[n_genes];

  printf("Total number of HLA types: %i.\n", total_n_HLA);

  Decimal HLA_prevalences[total_n_HLA];
  int wildtype_sequence[codon_sequence_length];
  Decimal *R = my_malloc(codon_sequence_length * sizeof(Decimal), __FILE__, __LINE__);
  Decimal *omega = my_malloc(codon_sequence_length * sizeof(Decimal), __FILE__, __LINE__);
  Decimal *reversion_selection = my_malloc(codon_sequence_length * sizeof(Decimal), __FILE__, __LINE__);

  memory_allocation(num_cons, num_root, codon_sequence_length,
                    max_internal_node_index, max_leaf_node_index, 
                    total_nodes, ploidy, n_genes, total_n_HLA,
                    leaf_node_index);

  int (*codon_sequence_matrix)[codon_sequence_length] = my_malloc(total_nodes *
                                                                  sizeof(int[codon_sequence_length]),
                                                                  __FILE__, __LINE__);
  Decimal (*HLA_selection_profiles)[codon_sequence_length] = my_malloc(total_n_HLA * sizeof(Decimal[codon_sequence_length]),
                                                                       __FILE__, __LINE__);

  load_parameters_for_simulation_from_json(json_parameters_path, codon_sequence_length,
                                           omega, R, reversion_selection, total_n_HLA,
                                           n_genes, n_HLA, HLA_prevalences,
                                           HLA_selection_profiles);
  
  Decimal sum_check;
  for(i = 0, k = 0; i < n_genes; i++) {
    sum_check = 0;
    for(h = 0; h < n_HLA[i]; h++, k++) {
      sum_check += HLA_prevalences[k];
    }
    if(sum_check > 1.00001 || sum_check < 0.9999) die("HLA prevalences for gene %i do not sum to 1\n", i+1);
  }
  
  if(cons_path != NULL) {
    printf("Mapping gaps to consensus...\n");
    // Set the consensus sequence - the consensus of the optional sequence file 
    // that is passed.
    char wildtype_sequence_dummy[3*codon_sequence_length+1];
    generate_consensus(cons_seqs, num_cons, 3*codon_sequence_length, wildtype_sequence_dummy);
    printf("Wildtype sequence:\n%s\n", wildtype_sequence_dummy);
    // By default, set the root as the wildtype sequence.
    for(i = 0; i < codon_sequence_length; i++)
      wildtype_sequence[i] = (int) amino_to_code(wildtype_sequence_dummy+i*3);

    if(root_path == NULL) {
      for(i = 0; i < codon_sequence_length; i++)
        codon_sequence_matrix[root_node][i] = wildtype_sequence[i];
    } else {
      printf("Mapping gaps to root...\n");
      char root_sequence_dummy[3*codon_sequence_length+1];
      generate_consensus(root_seqs, num_root, 3*codon_sequence_length, root_sequence_dummy);
      printf("Root sequence:\n%s\n", root_sequence_dummy);
      
      for(i = 0; i < codon_sequence_length; i++)
        codon_sequence_matrix[root_node][i] = (int) amino_to_code(root_sequence_dummy+i*3);
      printf("Number of root sequences: %i.\n", num_root);
      for(c = 0; c < num_root; c++) free(root_seqs[c]);
      free(root_seqs);
    }
    printf("Number of consensus sequences: %i.\n", num_cons);
    for(c = 0; c < num_cons; c++) free(cons_seqs[c]);
    free(cons_seqs);
  
  } else {
    for(i = 0; i < codon_sequence_length; i++) {
      // Sample the root sequence according to the HIV codon usage information.
      codon_sequence_matrix[root_node][i] = discrete_sampling_dist(NUM_CODONS, prior_C1);
      // As default, set the root node to the consensus sequence.  
      wildtype_sequence[i] = codon_sequence_matrix[root_node][i];
    }
  }
  
  // No matter what is read in, there is no recombination simulated - so make sure it's set to 0.
  for(i = 0; i < codon_sequence_length; i++) R[i] = 0;

  write_summary_json(json_summary_file,
                     mu, codon_sequence_length, ploidy,
                     n_genes, n_HLA, total_n_HLA,
                     HLA_prevalences,
                     omega, R, reversion_selection,
                     HLA_selection_profiles);

  free(R);
  
  fprintf(simulated_root_file, ">root_sequence\n");
  for(i = 0; i < codon_sequence_length; i++)
    fprintf(simulated_root_file, "%s", code_to_char(codon_sequence_matrix[root_node][i]));
  fprintf(simulated_root_file, "\n");

  int root_HLA[ploidy * n_genes];
  int cumulative_n_HLA = 0;

  for(i = 0, k = 0; i < n_genes; i++) {
    for(p = 0; p < ploidy; p++, k++) {
      root_HLA[k] = cumulative_n_HLA + 
                    discrete_sampling_dist(n_HLA[i], &HLA_prevalences[cumulative_n_HLA]);
      tree[root_node].HLAs[k] = root_HLA[k];
    }
    cumulative_n_HLA = cumulative_n_HLA + n_HLA[i];
  }

  printf("Passing HLA information...\n");
  pass_HLA(ploidy, n_genes, root_node,
                           tree, leaf_node_index, total_n_HLA,
                           n_HLA, HLA_prevalences);
  printf("Passed HLA information\n");
  
  // printf("Printing the tree\n");
  // for(i = 0; i < total_nodes; i++) {
  //   printf("%i %i %i "DECPRINT" %i", tree[i].node, tree[i].daughter_nodes[0],
  //          tree[i].daughter_nodes[1], tree[i].node_time,
  //          tree[i].seen_or_unseen);
  //   for(j = 0; j < (ploidy * n_genes); j++) {
  //     printf(" %i", tree[i].HLAs[j]);
  //   }
  //   printf("\n");
  // }

  if(write_tree_to_file == true) {
    write_tree(tree_data_file, tree, root_node, ploidy, n_genes);
    fclose(tree_data_file);
  }

  printf("Passing sequence information...\n");
  
  pass_codon_sequence_change(codon_sequence_length, ploidy,
                             n_genes, total_n_HLA, 
                             root_node, mu,
                             codon_sequence_matrix,
                             tree,
                             leaf_node_index,
                             S_portion, NS_portion,
                             HLA_selection_profiles,
                             wildtype_sequence,
                             omega, reversion_selection);

  printf("Passed sequence information\n"
         "Now generating .fasta files of reference and query sequences, and\n"
         "a .csv file of the HLA information associated to the query sequences.\n");

  if(num_queries < 0) {
    // Set the number of query sequences.
    num_queries = (int) (query_fraction * leaf_node_index);
    printf("Number of queries: %i.\n", num_queries);
  } else {
    printf("Number of queries: %i.\n", num_queries);
  }

  if(num_queries > leaf_node_index) die("Number of query sequences larger than the number of leaves");
  int *all_sequences = my_malloc(leaf_node_index * sizeof(int), __FILE__, __LINE__);
  int num_refs = leaf_node_index - num_queries;

  for(i = 0; i < leaf_node_index; i++) all_sequences[i] = i;

  save_simulated_ref_and_query_fasta(num_queries, num_refs, leaf_node_index,
                                     all_sequences, codon_sequence_length, codon_sequence_matrix,
                                     tree, ploidy, n_genes);

  // Now save the hla types to a .csv file.
  fprintf(hla_query_file, "\"\",");
  for(h = 0; h < total_n_HLA-1; h++) fprintf(hla_query_file, "\"%i\",", h+1);
  fprintf(hla_query_file, "\"%i\"\n", total_n_HLA);
  
  // Write the HLA types of the leaves to a file.
  int (*hla_types)[total_n_HLA] = my_malloc(leaf_node_index * sizeof(int[total_n_HLA]), __FILE__, __LINE__);

  for(i = 0; i < leaf_node_index; i++)
  {
    for(h = 0; h < total_n_HLA; h++)
      hla_types[i][h] = 0;
    for(j = 0; j < (n_genes * ploidy); j++)
      hla_types[i][tree[i].HLAs[j]] = 1;
  }

  // Write the query HLA types to a .csv file.
  for(i = num_refs; i < leaf_node_index; i++)
  {
    fprintf(hla_query_file,"\"simulated_seq_%i_HLA", all_sequences[i]+1);
    for(h = 0; h < (ploidy * n_genes); h++) fprintf(hla_query_file, "_%i", tree[all_sequences[i]].HLAs[h]);
    fprintf(hla_query_file, "\"");
    for(h = 0; h < total_n_HLA; h++) {
      fprintf(hla_query_file, ",%i", hla_types[all_sequences[i]][h]);
    }
    fprintf(hla_query_file, "\n");
  }

  free(hla_types); 
  free(internal_node_times);
  free(leaf_node_times);
  free(seen_or_unseen);
  free(codon_sequence_matrix);
  free(HLA_selection_profiles);
  free(all_sequences);
  free(omega);
  free(reversion_selection);

  free(tree[0].HLAs);
  free(tree);

  fclose(summary_file);
  fclose(json_summary_file);
  fclose(simulated_refs_file);
  fclose(simulated_root_file);
  fclose(simulated_queries_file);
  fclose(hla_query_file);

  clearup_gsl_dgen();
  return EXIT_SUCCESS;
}
示例#2
0
文件: main.cpp 项目: rvaser/spoa
/// Command-line entry point.
///
/// Parses the options, picks a FASTA or FASTQ parser from the input file's
/// extension, aligns every parsed sequence to the graph in input order, and
/// then prints the consensus and/or the multiple sequence alignment depending
/// on the `result` option (0 = consensus, 1 = MSA, 2 = both).
///
/// @return 0 on success (and after printing the version or help text), 1 on a
///         bad option, a missing or unsupported input file, or an
///         std::invalid_argument thrown while creating the alignment engine or
///         adding an alignment.
int main(int argc, char** argv) {

    // Defaults of the six scoring values forwarded to createAlignmentEngine().
    // NOTE(review): presumably match/mismatch scores and gap penalties --
    // confirm against help().
    std::int8_t m = 5;
    std::int8_t n = -4;
    std::int8_t g = -8;
    std::int8_t e = -6;
    std::int8_t q = -10;
    std::int8_t c = -4;

    std::uint8_t algorithm = 0;
    std::uint8_t result = 0;

    std::string dot_path = "";

    // getopt_long() returns an int. Keeping it in a plain char breaks the
    // comparison with -1 on platforms where char is unsigned (the loop would
    // never terminate), so the result must be stored as int.
    int opt;
    while ((opt = getopt_long(argc, argv, "m:n:g:e:q:c:l:r:d:h", options, nullptr)) != -1) {
        switch (opt) {
            // NOTE: atoi() yields an int; values outside the 8-bit range are
            // silently narrowed on assignment.
            case 'm': m = atoi(optarg); break;
            case 'n': n = atoi(optarg); break;
            case 'g': g = atoi(optarg); break;
            case 'e': e = atoi(optarg); break;
            case 'q': q = atoi(optarg); break;
            case 'c': c = atoi(optarg); break;
            case 'l': algorithm = atoi(optarg); break;
            case 'r': result = atoi(optarg); break;
            case 'd': dot_path = optarg; break;
            // 'v' is not in the short-option string, so it can only be
            // produced through an entry of the long `options` table.
            case 'v': std::cout << version << std::endl; return 0;
            case 'h': help(); return 0;
            default: return 1;
        }
    }

    // The first non-option argument is the input file.
    if (optind >= argc) {
        std::cerr << "[spoa::] error: missing input file!" << std::endl;
        help();
        return 1;
    }

    std::string sequences_path = argv[optind];

    // True when `src` ends with `suffix`.
    auto is_suffix = [](const std::string& src, const std::string& suffix) -> bool {
        if (src.size() < suffix.size()) {
            return false;
        }
        return src.compare(src.size() - suffix.size(), suffix.size(), suffix) == 0;
    };

    std::unique_ptr<bioparser::Parser<spoa::Sequence>> sparser = nullptr;

    if (is_suffix(sequences_path, ".fasta") || is_suffix(sequences_path, ".fa") ||
        is_suffix(sequences_path, ".fasta.gz") || is_suffix(sequences_path, ".fa.gz")) {
        sparser = bioparser::createParser<bioparser::FastaParser, spoa::Sequence>(
            sequences_path);
    } else if (is_suffix(sequences_path, ".fastq") || is_suffix(sequences_path, ".fq") ||
        is_suffix(sequences_path, ".fastq.gz") || is_suffix(sequences_path, ".fq.gz")) {
        sparser = bioparser::createParser<bioparser::FastqParser, spoa::Sequence>(
            sequences_path);
    } else {
        std::cerr << "[spoa::] error: file " << sequences_path <<
            " has unsupported format extension (valid extensions: .fasta, "
            ".fasta.gz, .fa, .fa.gz, .fastq, .fastq.gz, .fq, .fq.gz)!" <<
            std::endl;
        return 1;
    }

    std::unique_ptr<spoa::AlignmentEngine> alignment_engine;
    try {
        alignment_engine = spoa::createAlignmentEngine(
            static_cast<spoa::AlignmentType>(algorithm), m, n, g, e, q, c);
    } catch (const std::invalid_argument& exception) {
        std::cerr << exception.what() << std::endl;
        return 1;
    }

    auto graph = spoa::createGraph();

    std::vector<std::unique_ptr<spoa::Sequence>> sequences;
    sparser->parse(sequences, -1);

    // Preallocate the engine using the longest input sequence.
    std::size_t max_sequence_size = 0;
    for (const auto& it: sequences) {
        max_sequence_size = std::max(max_sequence_size, it->data().size());
    }
    alignment_engine->prealloc(max_sequence_size, 4);

    // Align each sequence against the graph built so far, then add it.
    for (const auto& it: sequences) {
        auto alignment = alignment_engine->align(it->data(), graph);
        try {
            graph->add_alignment(alignment, it->data(), it->quality());
        } catch (const std::invalid_argument& exception) {
            std::cerr << exception.what() << std::endl;
            return 1;
        }
    }

    if (result == 0 || result == 2) {
        std::string consensus = graph->generate_consensus();
        std::cout << "Consensus (" << consensus.size() << ")" << std::endl;
        std::cout << consensus << std::endl;
    }

    if (result == 1 || result == 2) {
        std::vector<std::string> msa;
        graph->generate_multiple_sequence_alignment(msa);
        std::cout << "Multiple sequence alignment" << std::endl;
        for (const auto& it: msa) {
            std::cout << it << std::endl;
        }
    }

    // Called unconditionally; dot_path is empty unless -d was given.
    // NOTE(review): presumably print_dot() ignores an empty path -- confirm.
    graph->print_dot(dot_path);

    return 0;
}