int main(int argc, char **argv) { init_rand(); setup_gsl_dgen(); dgen_parse_cmdline(argc, argv); int i, j, h, p, k, c; int cons_cap, num_cons = -1, root_cap, num_root = -1; char **cons_seqs, **root_seqs; int internal_node_index = 0, leaf_node_index = M; int max_internal_node_index = 2000000; int max_leaf_node_index = 2000000; int sum_seen_or_unseen = 0; int ploidy, codon_sequence_length = -1, n_genes, total_n_HLA; Decimal mu; if(json_parameters_path == NULL) die("No .json file passed."); load_lengths_for_simulation_from_json(json_parameters_path, &kappa, &mu, &codon_sequence_length, &total_n_HLA, &ploidy, &n_genes); if(ploidy < 1) die("Ploidy is less than 1."); if(n_genes < 1) die("Number of genes is less than 1."); if(cons_path != NULL) { printf("Loading sequences to obtain consensus...\n"); num_cons = load_seqs(cons_path, &cons_seqs, &cons_cap); assert(num_cons > 0); printf("Loaded %i sequences to determine consensus.\n", num_cons); codon_sequence_length = strlen(cons_seqs[0]); printf("Codon_sequence_length: %i\n",codon_sequence_length/3); if(codon_sequence_length % 3 != 0) die("Sequences contain partial codons [%i mod 3 != 0].", codon_sequence_length); for(c = 0; c < num_cons; c++) { if((int) strlen(cons_seqs[c]) != codon_sequence_length) { die("Sequences from which to derive the consensus sequence aren't all " "the same length."); } } codon_sequence_length = codon_sequence_length/3; } if(root_path != NULL) { printf("Loading sequences to obtain root...\n"); num_root = load_seqs(root_path, &root_seqs, &root_cap); printf("Loaded %i sequences to determine root.\n", num_root); if(cons_path == NULL) die("Did not pass a file to find the consensus sequence."); if((int) (strlen(root_seqs[0])/3) != codon_sequence_length) die("Sequences used to determine the root are different lengths to those used for the consensus."); for(c = 0; c < num_root; c++) { if((int) strlen(root_seqs[c]) != 3*codon_sequence_length) { die("Sequences from which to derive the root sequence aren't all " "the same length."); } } } Decimal *internal_node_times = my_malloc(max_internal_node_index * sizeof(Decimal) , __FILE__, __LINE__); Decimal *leaf_node_times = my_malloc(max_leaf_node_index * sizeof(Decimal), __FILE__, __LINE__); int *seen_or_unseen = my_malloc(max_internal_node_index * sizeof(int), __FILE__, __LINE__); birth_death_simulation_backwards(max_internal_node_index, max_leaf_node_index, internal_node_times, leaf_node_times, &internal_node_index, &leaf_node_index, seen_or_unseen, N, M, lambda, mu_tree, past_sampling); for(i = 0; i < internal_node_index; i++) sum_seen_or_unseen += seen_or_unseen[i]; int total_nodes = (2 * leaf_node_index) - 1 + internal_node_index - sum_seen_or_unseen; Tree *tree = my_malloc((total_nodes+1) * sizeof(Tree), __FILE__, __LINE__); // Now malloc the memory that this points to. int *HLAs_in_tree = my_malloc((total_nodes+1) * ploidy * n_genes * sizeof(int), __FILE__, __LINE__); for(i = 0; i < total_nodes; i++) tree[i].HLAs = &HLAs_in_tree[i * ploidy * n_genes]; construct_birth_death_tree(leaf_node_index, internal_node_index, leaf_node_times, internal_node_times, M, seen_or_unseen, tree); // Reverse the direction that time is measured in the tree. // DEV: Don't need to do this, waste of computation - sort. // DEV: The parent times are wrong when there are unseen nodes. for(i = 0; i < total_nodes; i++) tree[i].node_time = tree[total_nodes-1].node_time - tree[i].node_time; int root_node = tree[total_nodes-1].node; if(write_newick_tree_to_file == true) { write_newick_tree(newick_tree_data_file, tree, root_node, 1); fclose(newick_tree_data_file); } Decimal S_portion[NUM_CODONS]; Decimal NS_portion[NUM_CODONS]; for(c = 0; c < NUM_CODONS; c++) { S_portion[c] = kappa * beta_S[c] + beta_V[c]; NS_portion[c] = kappa * alpha_S[c] + alpha_V[c]; } int n_HLA[n_genes]; printf("Total number of HLA types: %i.\n", total_n_HLA); Decimal HLA_prevalences[total_n_HLA]; int wildtype_sequence[codon_sequence_length]; Decimal *R = my_malloc(codon_sequence_length * sizeof(Decimal), __FILE__, __LINE__); Decimal *omega = my_malloc(codon_sequence_length * sizeof(Decimal), __FILE__, __LINE__); Decimal *reversion_selection = my_malloc(codon_sequence_length * sizeof(Decimal), __FILE__, __LINE__); memory_allocation(num_cons, num_root, codon_sequence_length, max_internal_node_index, max_leaf_node_index, total_nodes, ploidy, n_genes, total_n_HLA, leaf_node_index); int (*codon_sequence_matrix)[codon_sequence_length] = my_malloc(total_nodes * sizeof(int[codon_sequence_length]), __FILE__, __LINE__); Decimal (*HLA_selection_profiles)[codon_sequence_length] = my_malloc(total_n_HLA * sizeof(Decimal[codon_sequence_length]), __FILE__, __LINE__); load_parameters_for_simulation_from_json(json_parameters_path, codon_sequence_length, omega, R, reversion_selection, total_n_HLA, n_genes, n_HLA, HLA_prevalences, HLA_selection_profiles); Decimal sum_check; for(i = 0, k = 0; i < n_genes; i++) { sum_check = 0; for(h = 0; h < n_HLA[i]; h++, k++) { sum_check += HLA_prevalences[k]; } if(sum_check > 1.00001 || sum_check < 0.9999) die("HLA prevalences for gene %i do not sum to 1\n", i+1); } if(cons_path != NULL) { printf("Mapping gaps to consensus...\n"); // Set the consensus sequence - the consensus of the optional sequence file // that is passed. char wildtype_sequence_dummy[3*codon_sequence_length+1]; generate_consensus(cons_seqs, num_cons, 3*codon_sequence_length, wildtype_sequence_dummy); printf("Wildtype sequence:\n%s\n", wildtype_sequence_dummy); // By default, set the root as the wildtype sequence. for(i = 0; i < codon_sequence_length; i++) wildtype_sequence[i] = (int) amino_to_code(wildtype_sequence_dummy+i*3); if(root_path == NULL) { for(i = 0; i < codon_sequence_length; i++) codon_sequence_matrix[root_node][i] = wildtype_sequence[i]; } else { printf("Mapping gaps to root...\n"); char root_sequence_dummy[3*codon_sequence_length+1]; generate_consensus(root_seqs, num_root, 3*codon_sequence_length, root_sequence_dummy); printf("Root sequence:\n%s\n", root_sequence_dummy); for(i = 0; i < codon_sequence_length; i++) codon_sequence_matrix[root_node][i] = (int) amino_to_code(root_sequence_dummy+i*3); printf("Number of root sequences: %i.\n", num_root); for(c = 0; c < num_root; c++) free(root_seqs[c]); free(root_seqs); } printf("Number of consensus sequences: %i.\n", num_cons); for(c = 0; c < num_cons; c++) free(cons_seqs[c]); free(cons_seqs); } else { for(i = 0; i < codon_sequence_length; i++) { // Sample the root sequence according to the HIV codon usage information. codon_sequence_matrix[root_node][i] = discrete_sampling_dist(NUM_CODONS, prior_C1); // As default, set the root node to the consensus sequence. wildtype_sequence[i] = codon_sequence_matrix[root_node][i]; } } // No matter what is read in, there is no recombination simulated - so make sure it's set to 0. for(i = 0; i < codon_sequence_length; i++) R[i] = 0; write_summary_json(json_summary_file, mu, codon_sequence_length, ploidy, n_genes, n_HLA, total_n_HLA, HLA_prevalences, omega, R, reversion_selection, HLA_selection_profiles); free(R); fprintf(simulated_root_file, ">root_sequence\n"); for(i = 0; i < codon_sequence_length; i++) fprintf(simulated_root_file, "%s", code_to_char(codon_sequence_matrix[root_node][i])); fprintf(simulated_root_file, "\n"); int root_HLA[ploidy * n_genes]; int cumulative_n_HLA = 0; for(i = 0, k = 0; i < n_genes; i++) { for(p = 0; p < ploidy; p++, k++) { root_HLA[k] = cumulative_n_HLA + discrete_sampling_dist(n_HLA[i], &HLA_prevalences[cumulative_n_HLA]); tree[root_node].HLAs[k] = root_HLA[k]; } cumulative_n_HLA = cumulative_n_HLA + n_HLA[i]; } printf("Passing HLA information...\n"); pass_HLA(ploidy, n_genes, root_node, tree, leaf_node_index, total_n_HLA, n_HLA, HLA_prevalences); printf("Passed HLA information\n"); // printf("Printing the tree\n"); // for(i = 0; i < total_nodes; i++) { // printf("%i %i %i "DECPRINT" %i", tree[i].node, tree[i].daughter_nodes[0], // tree[i].daughter_nodes[1], tree[i].node_time, // tree[i].seen_or_unseen); // for(j = 0; j < (ploidy * n_genes); j++) { // printf(" %i", tree[i].HLAs[j]); // } // printf("\n"); // } if(write_tree_to_file == true) { write_tree(tree_data_file, tree, root_node, ploidy, n_genes); fclose(tree_data_file); } printf("Passing sequence information...\n"); pass_codon_sequence_change(codon_sequence_length, ploidy, n_genes, total_n_HLA, root_node, mu, codon_sequence_matrix, tree, leaf_node_index, S_portion, NS_portion, HLA_selection_profiles, wildtype_sequence, omega, reversion_selection); printf("Passed sequence information\n" "Now generating .fasta files of reference and query sequences, and\n" "a .csv file of the HLA information associated to the query sequences.\n"); if(num_queries < 0) { // Set the number of query sequences. num_queries = (int) (query_fraction * leaf_node_index); printf("Number of queries: %i.\n", num_queries); } else { printf("Number of queries: %i.\n", num_queries); } if(num_queries > leaf_node_index) die("Number of query sequences larger than the number of leaves"); int *all_sequences = my_malloc(leaf_node_index * sizeof(int), __FILE__, __LINE__); int num_refs = leaf_node_index - num_queries; for(i = 0; i < leaf_node_index; i++) all_sequences[i] = i; save_simulated_ref_and_query_fasta(num_queries, num_refs, leaf_node_index, all_sequences, codon_sequence_length, codon_sequence_matrix, tree, ploidy, n_genes); // Now save the hla types to a .csv file. fprintf(hla_query_file, "\"\","); for(h = 0; h < total_n_HLA-1; h++) fprintf(hla_query_file, "\"%i\",", h+1); fprintf(hla_query_file, "\"%i\"\n", total_n_HLA); // Write the HLA types of the leaves to a file. int (*hla_types)[total_n_HLA] = my_malloc(leaf_node_index * sizeof(int[total_n_HLA]), __FILE__, __LINE__); for(i = 0; i < leaf_node_index; i++) { for(h = 0; h < total_n_HLA; h++) hla_types[i][h] = 0; for(j = 0; j < (n_genes * ploidy); j++) hla_types[i][tree[i].HLAs[j]] = 1; } // Write the query HLA types to a .csv file. for(i = num_refs; i < leaf_node_index; i++) { fprintf(hla_query_file,"\"simulated_seq_%i_HLA", all_sequences[i]+1); for(h = 0; h < (ploidy * n_genes); h++) fprintf(hla_query_file, "_%i", tree[all_sequences[i]].HLAs[h]); fprintf(hla_query_file, "\""); for(h = 0; h < total_n_HLA; h++) { fprintf(hla_query_file, ",%i", hla_types[all_sequences[i]][h]); } fprintf(hla_query_file, "\n"); } free(hla_types); free(internal_node_times); free(leaf_node_times); free(seen_or_unseen); free(codon_sequence_matrix); free(HLA_selection_profiles); free(all_sequences); free(omega); free(reversion_selection); free(tree[0].HLAs); free(tree); fclose(summary_file); fclose(json_summary_file); fclose(simulated_refs_file); fclose(simulated_root_file); fclose(simulated_queries_file); fclose(hla_query_file); clearup_gsl_dgen(); return EXIT_SUCCESS; }
int main(int argc, char** argv) { std::int8_t m = 5; std::int8_t n = -4; std::int8_t g = -8; std::int8_t e = -6; std::int8_t q = -10; std::int8_t c = -4; std::uint8_t algorithm = 0; std::uint8_t result = 0; std::string dot_path = ""; char opt; while ((opt = getopt_long(argc, argv, "m:n:g:e:q:c:l:r:d:h", options, nullptr)) != -1) { switch (opt) { case 'm': m = atoi(optarg); break; case 'n': n = atoi(optarg); break; case 'g': g = atoi(optarg); break; case 'e': e = atoi(optarg); break; case 'q': q = atoi(optarg); break; case 'c': c = atoi(optarg); break; case 'l': algorithm = atoi(optarg); break; case 'r': result = atoi(optarg); break; case 'd': dot_path = optarg; break; case 'v': std::cout << version << std::endl; return 0; case 'h': help(); return 0; default: return 1; } } if (optind >= argc) { std::cerr << "[spoa::] error: missing input file!" << std::endl; help(); return 1; } std::string sequences_path = argv[optind]; auto is_suffix = [](const std::string& src, const std::string& suffix) -> bool { if (src.size() < suffix.size()) { return false; } return src.compare(src.size() - suffix.size(), suffix.size(), suffix) == 0; }; std::unique_ptr<bioparser::Parser<spoa::Sequence>> sparser = nullptr; if (is_suffix(sequences_path, ".fasta") || is_suffix(sequences_path, ".fa") || is_suffix(sequences_path, ".fasta.gz") || is_suffix(sequences_path, ".fa.gz")) { sparser = bioparser::createParser<bioparser::FastaParser, spoa::Sequence>( sequences_path); } else if (is_suffix(sequences_path, ".fastq") || is_suffix(sequences_path, ".fq") || is_suffix(sequences_path, ".fastq.gz") || is_suffix(sequences_path, ".fq.gz")) { sparser = bioparser::createParser<bioparser::FastqParser, spoa::Sequence>( sequences_path); } else { std::cerr << "[spoa::] error: file " << sequences_path << " has unsupported format extension (valid extensions: .fasta, " ".fasta.gz, .fa, .fa.gz, .fastq, .fastq.gz, .fq, .fq.gz)!" << std::endl; return 1; } std::unique_ptr<spoa::AlignmentEngine> alignment_engine; try { alignment_engine = spoa::createAlignmentEngine( static_cast<spoa::AlignmentType>(algorithm), m, n, g, e, q, c); } catch(std::invalid_argument& exception) { std::cerr << exception.what() << std::endl; return 1; } auto graph = spoa::createGraph(); std::vector<std::unique_ptr<spoa::Sequence>> sequences; sparser->parse(sequences, -1); std::size_t max_sequence_size = 0; for (const auto& it: sequences) { max_sequence_size = std::max(max_sequence_size, it->data().size()); } alignment_engine->prealloc(max_sequence_size, 4); for (const auto& it: sequences) { auto alignment = alignment_engine->align(it->data(), graph); try { graph->add_alignment(alignment, it->data(), it->quality()); } catch(std::invalid_argument& exception) { std::cerr << exception.what() << std::endl; return 1; } } if (result == 0 || result == 2) { std::string consensus = graph->generate_consensus(); std::cout << "Consensus (" << consensus.size() << ")" << std::endl; std::cout << consensus << std::endl; } if (result == 1 || result == 2) { std::vector<std::string> msa; graph->generate_multiple_sequence_alignment(msa); std::cout << "Multiple sequence alignment" << std::endl; for (const auto& it: msa) { std::cout << it << std::endl; } } graph->print_dot(dot_path); return 0; }