void actionMakeOtusMothur() { estrarray uarr; eseqclusterData cdata; ldieif(argvc<4,"syntax: "+efile(argv[0]).basename()+" -makeotus_mothur <alignment> <mergelog> <cutoff>"); cout << "# loading seqs file: " << argv[1] << endl; load_seqs(argv[1],uarr); cdata.load(argv[2],uarr.size()); float t=estr(argv[3]).f(); earray<eintarray> otuarr; cdata.getOTU(t,otuarr,uarr.size()); cout << "label\tnumOtus"; for (long i=0; i<otuarr.size(); ++i) cout << "\tOTU" << i; cout << endl; cout << (1.0-t) << "\t" << otuarr.size(); for (long i=0; i<otuarr.size(); ++i){ // cout << ">OTU" << i << " otu_size="<< otuarr[i].size() << endl; cout << "\t" << uarr.keys(otuarr[i][0]); for (long j=1; j<otuarr[i].size(); ++j) cout << "," << uarr.keys(otuarr[i][j]); } cout << endl; exit(0); }
int main(int argc, char **argv) { FILE *input; FILE *repeats = 0; FILE *output; int start_x, end_x, start_y, end_y; debug_config(progname); get_options(argc, argv, progname); unsigned long start_mem, cand_mem, table_mem; input = fopen(sequence_filename, "r"); if(!input) fatal("couldn't open %s: %s\n",sequence_filename,strerror(errno)); if(repeat_filename) { repeats = fopen(repeat_filename, "r"); if(!repeats) fatal("couldn't open %s: %s\n",repeat_filename,strerror(errno)); } if(output_filename) { output = fopen(output_filename, "w"); } else { output = stdout; } // Data is in the form: // >id metadata // data // >id metadata // data // >> // ... set_k(kmer_size); set_window_size(window_size); // If we only give one file, do an all vs. all // on them. if(!second_sequence_filename) { num_seqs = load_seqs(input); start_x = 0; end_x = num_seqs; start_y = 0; end_y = num_seqs; } // If we had two files, do not compare ones from // the same file to each other. else { FILE *input2 = fopen(second_sequence_filename, "r"); if(!input2) { fprintf(stderr, "Could not open file %s for reading.\n", second_sequence_filename); exit(1); } num_seqs = load_seqs_two_files(input, &end_x, input2, &end_y); start_x = 0; start_y = end_x; debug(D_DEBUG,"First file contains %d sequences, stored from (%d,%d].\n", end_x, start_x, end_x); debug(D_DEBUG,"Second file contains %d sequences, stored from (%d,%d].\n", end_y-end_x, start_y, end_y); } fclose(input); debug(D_DEBUG,"Loaded %d sequences\n",num_seqs); init_cand_table(num_seqs * 5); init_mer_table(num_seqs * 5); if(repeats) { int repeat_count = init_repeat_mer_table(repeats, 2000000, 0); fclose(repeats); debug(D_DEBUG,"Loaded %d repeated mers\n", repeat_count); } if(rectangle_size == -1) { // Do get_mem_avail*0.95 to leave some memory for overhead rectangle_size = DYNAMIC_RECTANGLE_SIZE(max_mem_kb); debug(D_DEBUG,"Mem avail: %lu, rectangle size: %d\n",(unsigned long)MEMORY_FOR_MERS(max_mem_kb), rectangle_size); } int curr_start_x = start_x; int 
curr_start_y = start_y; candidate_t *output_list = 0; int num_in_list; while(curr_start_y < end_y) { while(curr_start_x < end_x) { if(start_x == start_y) { debug(D_DEBUG,"Loading mer table (%d,%d)\n", curr_rect_x, curr_rect_y); } else { debug(D_DEBUG,"Loading mer table for [%d,%d) and [%d,%d)\n",curr_start_x, MIN(curr_start_x + rectangle_size, end_x), curr_start_y, MIN(curr_start_y + rectangle_size, end_y)); } start_mem = get_mem_usage(); load_mer_table_subset(curr_start_x, MIN(curr_start_x + rectangle_size, end_x), curr_start_y, MIN(curr_start_y + rectangle_size, end_y), (curr_start_x == curr_start_y)); table_mem = get_mem_usage(); debug(D_DEBUG,"Finished loading, now generating candidates\n"); debug(D_DEBUG,"Memory used: %lu\n", table_mem - start_mem); generate_candidates(); cand_mem = get_mem_usage(); debug(D_DEBUG,"Total candidates generated: %llu\n", (long long unsigned int) total_cand); debug(D_DEBUG,"Candidate memory used: %lu\n", cand_mem - table_mem); output_list = retrieve_candidates(&num_in_list); output_candidate_list(output, output_list, num_in_list); free(output_list); fflush(output); debug(D_DEBUG,"Now freeing\n"); free_cand_table(); free_mer_table(); debug(D_DEBUG,"Successfully output and freed!\n"); curr_rect_x++; curr_start_x += rectangle_size; } curr_rect_y++; curr_start_y += rectangle_size; curr_rect_x = curr_rect_y; if(start_y == 0) { curr_start_x = curr_start_y; } else { curr_start_x = start_x; } } fclose(output); return 0; }
/*
 * Forward simulator entry point: simulates a birth-death tree, assigns HLA
 * types down the tree, evolves codon sequences along it, and writes the
 * resulting reference/query FASTA files plus a .csv of query HLA types.
 *
 * Inputs come from the command line (dgen_parse_cmdline) and a JSON
 * parameter file; optional FASTA inputs supply a consensus (cons_path) and
 * root (root_path) sequence. Relies on many file-level globals set by the
 * cmdline parser (paths, output FILE*s, N, M, lambda, mu_tree, kappa, ...).
 * Returns EXIT_SUCCESS; dies with a message on invalid input.
 */
int main(int argc, char **argv) {
  init_rand();
  setup_gsl_dgen();
  dgen_parse_cmdline(argc, argv);
  int i, j, h, p, k, c;                       // loop indices
  int cons_cap, num_cons = -1, root_cap, num_root = -1;
  char **cons_seqs, **root_seqs;
  int internal_node_index = 0, leaf_node_index = M;
  int max_internal_node_index = 2000000;      // hard caps on simulated tree size
  int max_leaf_node_index = 2000000;
  int sum_seen_or_unseen = 0;
  int ploidy, codon_sequence_length = -1, n_genes, total_n_HLA;
  Decimal mu;
  if(json_parameters_path == NULL) die("No .json file passed.");
  // First pass over the JSON: only the dimensioning quantities.
  load_lengths_for_simulation_from_json(json_parameters_path, &kappa, &mu, &codon_sequence_length, &total_n_HLA, &ploidy, &n_genes);
  if(ploidy < 1) die("Ploidy is less than 1.");
  if(n_genes < 1) die("Number of genes is less than 1.");
  if(cons_path != NULL) {
    printf("Loading sequences to obtain consensus...\n");
    num_cons = load_seqs(cons_path, &cons_seqs, &cons_cap);
    assert(num_cons > 0);
    printf("Loaded %i sequences to determine consensus.\n", num_cons);
    // The consensus file overrides the JSON sequence length; initially in
    // nucleotides, converted to codons after the length checks below.
    codon_sequence_length = strlen(cons_seqs[0]);
    printf("Codon_sequence_length: %i\n",codon_sequence_length/3);
    if(codon_sequence_length % 3 != 0)
      die("Sequences contain partial codons [%i mod 3 != 0].", codon_sequence_length);
    for(c = 0; c < num_cons; c++) {
      if((int) strlen(cons_seqs[c]) != codon_sequence_length) {
        die("Sequences from which to derive the consensus sequence aren't all "
            "the same length.");
      }
    }
    codon_sequence_length = codon_sequence_length/3;  // nucleotides -> codons
  }
  if(root_path != NULL) {
    printf("Loading sequences to obtain root...\n");
    // NOTE(review): unlike num_cons above, num_root is not asserted > 0 here
    // before root_seqs[0] is dereferenced — confirm load_seqs dies on empty.
    num_root = load_seqs(root_path, &root_seqs, &root_cap);
    printf("Loaded %i sequences to determine root.\n", num_root);
    if(cons_path == NULL) die("Did not pass a file to find the consensus sequence.");
    if((int) (strlen(root_seqs[0])/3) != codon_sequence_length)
      die("Sequences used to determine the root are different lengths to those used for the consensus.");
    for(c = 0; c < num_root; c++) {
      if((int) strlen(root_seqs[c]) != 3*codon_sequence_length) {
        die("Sequences from which to derive the root sequence aren't all "
            "the same length.");
      }
    }
  }
  Decimal *internal_node_times = my_malloc(max_internal_node_index * sizeof(Decimal) , __FILE__, __LINE__);
  Decimal *leaf_node_times = my_malloc(max_leaf_node_index * sizeof(Decimal), __FILE__, __LINE__);
  int *seen_or_unseen = my_malloc(max_internal_node_index * sizeof(int), __FILE__, __LINE__);
  // Simulate the tree topology times; fills node counts and seen/unseen flags.
  birth_death_simulation_backwards(max_internal_node_index, max_leaf_node_index,
    internal_node_times, leaf_node_times, &internal_node_index, &leaf_node_index,
    seen_or_unseen, N, M, lambda, mu_tree, past_sampling);
  for(i = 0; i < internal_node_index; i++) sum_seen_or_unseen += seen_or_unseen[i];
  // Total nodes in the constructed tree after dropping unseen internals.
  int total_nodes = (2 * leaf_node_index) - 1 + internal_node_index - sum_seen_or_unseen;
  Tree *tree = my_malloc((total_nodes+1) * sizeof(Tree), __FILE__, __LINE__);
  // Now malloc the memory that this points to: one flat HLA block, sliced
  // per node (ploidy * n_genes entries each).
  int *HLAs_in_tree = my_malloc((total_nodes+1) * ploidy * n_genes * sizeof(int), __FILE__, __LINE__);
  for(i = 0; i < total_nodes; i++) tree[i].HLAs = &HLAs_in_tree[i * ploidy * n_genes];
  construct_birth_death_tree(leaf_node_index, internal_node_index, leaf_node_times, internal_node_times, M, seen_or_unseen, tree);
  // Reverse the direction that time is measured in the tree.
  // DEV: Don't need to do this, waste of computation - sort.
  // DEV: The parent times are wrong when there are unseen nodes.
  for(i = 0; i < total_nodes; i++)
    tree[i].node_time = tree[total_nodes-1].node_time - tree[i].node_time;
  int root_node = tree[total_nodes-1].node;  // last constructed node is the root
  if(write_newick_tree_to_file == true) {
    write_newick_tree(newick_tree_data_file, tree, root_node, 1);
    fclose(newick_tree_data_file);
  }
  // Per-codon synonymous / non-synonymous rate portions (kappa weights
  // transitions vs transversions).
  Decimal S_portion[NUM_CODONS];
  Decimal NS_portion[NUM_CODONS];
  for(c = 0; c < NUM_CODONS; c++) {
    S_portion[c] = kappa * beta_S[c] + beta_V[c];
    NS_portion[c] = kappa * alpha_S[c] + alpha_V[c];
  }
  int n_HLA[n_genes];  // number of HLA types per gene (filled from JSON below)
  printf("Total number of HLA types: %i.\n", total_n_HLA);
  Decimal HLA_prevalences[total_n_HLA];
  int wildtype_sequence[codon_sequence_length];
  Decimal *R = my_malloc(codon_sequence_length * sizeof(Decimal), __FILE__, __LINE__);
  Decimal *omega = my_malloc(codon_sequence_length * sizeof(Decimal), __FILE__, __LINE__);
  Decimal *reversion_selection = my_malloc(codon_sequence_length * sizeof(Decimal), __FILE__, __LINE__);
  memory_allocation(num_cons, num_root, codon_sequence_length, max_internal_node_index,
    max_leaf_node_index, total_nodes, ploidy, n_genes, total_n_HLA, leaf_node_index);
  int (*codon_sequence_matrix)[codon_sequence_length] = my_malloc(total_nodes * sizeof(int[codon_sequence_length]), __FILE__, __LINE__);
  Decimal (*HLA_selection_profiles)[codon_sequence_length] = my_malloc(total_n_HLA * sizeof(Decimal[codon_sequence_length]), __FILE__, __LINE__);
  // Second pass over the JSON: the per-site parameter arrays.
  load_parameters_for_simulation_from_json(json_parameters_path, codon_sequence_length,
    omega, R, reversion_selection, total_n_HLA, n_genes, n_HLA, HLA_prevalences, HLA_selection_profiles);
  // Sanity check: prevalences within each gene must sum to ~1 (tolerant of
  // rounding in the JSON).
  Decimal sum_check;
  for(i = 0, k = 0; i < n_genes; i++) {
    sum_check = 0;
    for(h = 0; h < n_HLA[i]; h++, k++) {
      sum_check += HLA_prevalences[k];
    }
    if(sum_check > 1.00001 || sum_check < 0.9999)
      die("HLA prevalences for gene %i do not sum to 1\n", i+1);
  }
  if(cons_path != NULL) {
    printf("Mapping gaps to consensus...\n");
    // Set the consensus sequence - the consensus of the optional sequence file
    // that is passed.
    char wildtype_sequence_dummy[3*codon_sequence_length+1];
    generate_consensus(cons_seqs, num_cons, 3*codon_sequence_length, wildtype_sequence_dummy);
    printf("Wildtype sequence:\n%s\n", wildtype_sequence_dummy);
    // By default, set the root as the wildtype sequence.
    for(i = 0; i < codon_sequence_length; i++)
      wildtype_sequence[i] = (int) amino_to_code(wildtype_sequence_dummy+i*3);
    if(root_path == NULL) {
      for(i = 0; i < codon_sequence_length; i++)
        codon_sequence_matrix[root_node][i] = wildtype_sequence[i];
    } else {
      printf("Mapping gaps to root...\n");
      char root_sequence_dummy[3*codon_sequence_length+1];
      generate_consensus(root_seqs, num_root, 3*codon_sequence_length, root_sequence_dummy);
      printf("Root sequence:\n%s\n", root_sequence_dummy);
      for(i = 0; i < codon_sequence_length; i++)
        codon_sequence_matrix[root_node][i] = (int) amino_to_code(root_sequence_dummy+i*3);
      printf("Number of root sequences: %i.\n", num_root);
      for(c = 0; c < num_root; c++) free(root_seqs[c]);
      free(root_seqs);
    }
    printf("Number of consensus sequences: %i.\n", num_cons);
    for(c = 0; c < num_cons; c++) free(cons_seqs[c]);
    free(cons_seqs);
  } else {
    for(i = 0; i < codon_sequence_length; i++) {
      // Sample the root sequence according to the HIV codon usage information.
      codon_sequence_matrix[root_node][i] = discrete_sampling_dist(NUM_CODONS, prior_C1);
      // As default, set the root node to the consensus sequence.
      wildtype_sequence[i] = codon_sequence_matrix[root_node][i];
    }
  }
  // No matter what is read in, there is no recombination simulated - so make sure it's set to 0.
  for(i = 0; i < codon_sequence_length; i++) R[i] = 0;
  write_summary_json(json_summary_file, mu, codon_sequence_length, ploidy, n_genes,
    n_HLA, total_n_HLA, HLA_prevalences, omega, R, reversion_selection, HLA_selection_profiles);
  free(R);
  // Write the root sequence as FASTA.
  fprintf(simulated_root_file, ">root_sequence\n");
  for(i = 0; i < codon_sequence_length; i++)
    fprintf(simulated_root_file, "%s", code_to_char(codon_sequence_matrix[root_node][i]));
  fprintf(simulated_root_file, "\n");
  // Sample the root's HLA types: for each gene, ploidy independent draws from
  // that gene's prevalence distribution, offset into the global HLA indexing.
  int root_HLA[ploidy * n_genes];
  int cumulative_n_HLA = 0;
  for(i = 0, k = 0; i < n_genes; i++) {
    for(p = 0; p < ploidy; p++, k++) {
      root_HLA[k] = cumulative_n_HLA + discrete_sampling_dist(n_HLA[i], &HLA_prevalences[cumulative_n_HLA]);
      tree[root_node].HLAs[k] = root_HLA[k];
    }
    cumulative_n_HLA = cumulative_n_HLA + n_HLA[i];
  }
  printf("Passing HLA information...\n");
  pass_HLA(ploidy, n_genes, root_node, tree, leaf_node_index, total_n_HLA, n_HLA, HLA_prevalences);
  printf("Passed HLA information\n");
  // printf("Printing the tree\n");
  // for(i = 0; i < total_nodes; i++) {
  //   printf("%i %i %i "DECPRINT" %i", tree[i].node, tree[i].daughter_nodes[0],
  //          tree[i].daughter_nodes[1], tree[i].node_time,
  //          tree[i].seen_or_unseen);
  //   for(j = 0; j < (ploidy * n_genes); j++) {
  //     printf(" %i", tree[i].HLAs[j]);
  //   }
  //   printf("\n");
  // }
  if(write_tree_to_file == true) {
    write_tree(tree_data_file, tree, root_node, ploidy, n_genes);
    fclose(tree_data_file);
  }
  printf("Passing sequence information...\n");
  // Evolve codon sequences down the tree under the mutation/selection model.
  pass_codon_sequence_change(codon_sequence_length, ploidy, n_genes, total_n_HLA,
    root_node, mu, codon_sequence_matrix, tree, leaf_node_index, S_portion, NS_portion,
    HLA_selection_profiles, wildtype_sequence, omega, reversion_selection);
  printf("Passed sequence information\n"
         "Now generating .fasta files of reference and query sequences, and\n"
         "a .csv file of the HLA information associated to the query sequences.\n");
  if(num_queries < 0) {
    // Set the number of query sequences.
    num_queries = (int) (query_fraction * leaf_node_index);
    printf("Number of queries: %i.\n", num_queries);
  } else {
    printf("Number of queries: %i.\n", num_queries);
  }
  if(num_queries > leaf_node_index) die("Number of query sequences larger than the number of leaves");
  int *all_sequences = my_malloc(leaf_node_index * sizeof(int), __FILE__, __LINE__);
  int num_refs = leaf_node_index - num_queries;
  for(i = 0; i < leaf_node_index; i++) all_sequences[i] = i;
  // Leaves [0, num_refs) become references, [num_refs, leaf_node_index) queries.
  save_simulated_ref_and_query_fasta(num_queries, num_refs, leaf_node_index,
    all_sequences, codon_sequence_length, codon_sequence_matrix, tree, ploidy, n_genes);
  // Now save the hla types to a .csv file.
  // Header: empty corner cell, then one column per HLA type (1-based).
  fprintf(hla_query_file, "\"\",");
  for(h = 0; h < total_n_HLA-1; h++) fprintf(hla_query_file, "\"%i\",", h+1);
  fprintf(hla_query_file, "\"%i\"\n", total_n_HLA);
  // Write the HLA types of the leaves to a file.
  // hla_types[i][h] is a 0/1 indicator: does leaf i carry HLA type h?
  int (*hla_types)[total_n_HLA] = my_malloc(leaf_node_index * sizeof(int[total_n_HLA]), __FILE__, __LINE__);
  for(i = 0; i < leaf_node_index; i++) {
    for(h = 0; h < total_n_HLA; h++) hla_types[i][h] = 0;
    for(j = 0; j < (n_genes * ploidy); j++) hla_types[i][tree[i].HLAs[j]] = 1;
  }
  // Write the query HLA types to a .csv file.
  for(i = num_refs; i < leaf_node_index; i++) {
    fprintf(hla_query_file,"\"simulated_seq_%i_HLA", all_sequences[i]+1);
    for(h = 0; h < (ploidy * n_genes); h++)
      fprintf(hla_query_file, "_%i", tree[all_sequences[i]].HLAs[h]);
    fprintf(hla_query_file, "\"");
    for(h = 0; h < total_n_HLA; h++) {
      fprintf(hla_query_file, ",%i", hla_types[all_sequences[i]][h]);
    }
    fprintf(hla_query_file, "\n");
  }
  // Cleanup. tree[0].HLAs points at the start of the flat HLAs_in_tree block,
  // so freeing it releases every node's HLA storage.
  free(hla_types);
  free(internal_node_times);
  free(leaf_node_times);
  free(seen_or_unseen);
  free(codon_sequence_matrix);
  free(HLA_selection_profiles);
  free(all_sequences);
  free(omega);
  free(reversion_selection);
  free(tree[0].HLAs);
  free(tree);
  fclose(summary_file);
  fclose(json_summary_file);
  fclose(simulated_refs_file);
  fclose(simulated_root_file);
  fclose(simulated_queries_file);
  fclose(hla_query_file);
  clearup_gsl_dgen();
  return EXIT_SUCCESS;
}
// Pick a representative sequence for each OTU and print it to stdout.
//
// Usage: <prog> -makereps <alignment> <otu>
//   argv[1] alignment file, argv[2] OTU listing: a '>' line starts each OTU,
//   following lines are tab-delimited entries whose first field names a
//   member sequence.
// For every OTU the member with the highest abundance-weighted average
// identity to the rest of the OTU is chosen and emitted as a FASTA-like
// record (">OTU<n> <name> avg_id=... otu_size=...").
// Terminates via exit(0); dies on malformed input.
// NOTE(review): relies on file-level globals (arr, seqlen, dists, partsTotal,
// taskman, dfunc, mutex, nthreads, winlen) — behavior depends on their state.
void actionMakeReps() {
  ldieif(argvc<3,"syntax: "+efile(argv[0]).basename()+" -makereps <alignment> <otu>");
  estrhashof<INDTYPE> seqind;   // sequence name -> index (into global arr)
  estrarray uarr;               // uncompressed copy, used to print sequences
  cout << "# loading seqs file: " << argv[1] << endl;
  load_seqs_compressed(argv[1],arr,seqind,seqlen);
  load_seqs(argv[1],uarr);
  earray<ebasicarray<INDTYPE> > otus;
  efile f;
  estr line;
  estrarray parts;
  // Parse the OTU file: '>' starts a new OTU, '#'/empty lines are skipped,
  // any other line adds one member to the current OTU.
  f.open(argv[2],"r");
  while (!f.eof()){
    f.readln(line);
    if (line.len()==0 || line[0]=='#') continue;
    if (line[0]=='>'){
      otus.add(ebasicarray<INDTYPE>());
      continue;
    }
    ldieif(otus.size()==0,"first entry not start of OTU or missing '>'");
    parts=line.explode("\t");
    ldieif(parts.size()==0,"array empty: "+line);
    ldieif(!seqind.exists(parts[0]),"sequence not found: "+parts[0]);
    otus[otus.size()-1].add(seqind[parts[0]]);
  }
  cerr << endl;
  // Collapse duplicates: uniqmask[i] holds the multiplicity of unique
  // sequence i (0 for sequences that are duplicates of another) — presumably
  // including the sequence itself; confirm against finduniq().
  ebasicarray<INDTYPE> tuniqind;
  earray<ebasicarray<INDTYPE> > dupslist;
  finduniq(tuniqind,dupslist);
  eintarray uniqmask;
  uniqmask.init(arr.size(),0);
  for (long i=0; i<tuniqind.size(); ++i) uniqmask[tuniqind[i]]=dupslist[i].size();
//  ebasicarray<INDTYPE> uniqind;
  taskman.createThread(nthreads);
  ebasicarray<INDTYPE> uniqind;
  const float t=0.0;  // distance threshold passed to the worker tasks
  efloatarray avgdist;
  for (long j=0; j<otus.size(); ++j){
//    cout << "# computing distances for otu "<< j << " size: " << otus[j].size() << endl;
    // Singleton OTU: its only member is the representative.
    if (otus[j].size()==1){
      cout << ">OTU" << j << " " << arr.keys(otus[j][0]) << " avg_id=1.0 otu_size=1" << endl;
      cout << uarr.values(otus[j][0]) << endl;
      continue;
    }
    // Restrict to the unique representatives present in this OTU.
    uniqind.clear();
    for (long l=0; l<otus[j].size(); ++l){
      if (uniqmask[otus[j][l]]!=0) uniqind.add(otus[j][l]);
    }
//    uniqind=otus[j];
    ldieif(uniqind.size()==0,"empty OTU");
    // All members identical to a single unique sequence: trivial choice.
    if (uniqind.size()==1){
      cout << ">OTU" << j << " " << arr.keys(uniqind[0]) << " avg_id=1.0 otu_size=" << otus[j].size() << endl;
      cout << uarr.values(uniqind[0]) << endl;
      continue;
    }
    // Compute pairwise distances within the OTU in parallel, splitting the
    // work into partsTotal tasks over the shared taskman thread pool.
    avgdist.clear();
    avgdist.init(arr.size(),0.0);
    dists.clear();
    partsTotal=10000;
    if (partsTotal>(uniqind.size()-1l)*uniqind.size()/20l) partsTotal=(uniqind.size()-1l)*uniqind.size()/20l; // make fewer tasks if too few calculations per task
    if (partsTotal<=0) partsTotal=1;
    taskman.clear();
    for (long i=0; i<partsTotal; ++i)
      taskman.addTask(dfunc.value().calcfunc,evararray(mutex,uniqind,arr,dists,(const int&)seqlen,(const long int&)i,(const long int&)partsTotal,(const float&)t,(const int&)winlen));
    taskman.wait();
    // Accumulate abundance-weighted scores per unique member. NOTE(review):
    // d.dist appears to hold similarity/identity (the output labels the
    // result avg_id) — confirm in the distance function.
    for (long i=0; i<dists.size(); ++i){
      eseqdist& d(dists[i]);
      avgdist[d.x]+=d.dist*uniqmask[d.y];
      avgdist[d.y]+=d.dist*uniqmask[d.x];
//      cout << "# "<< arr.keys(d.x) << " " << arr.keys(d.y) << " " << d.dist << " " << uniqmask[d.x] << " " << uniqmask[d.y] << endl;
    }
    // Choose the member with the highest accumulated score; the
    // uniqmask[ti]-1 term credits a member's own duplicates as identity 1.
    long k=uniqind[0];
    for (long i=0; i<uniqind.size(); ++i){
      long ti=uniqind[i];
      avgdist[ti]+=uniqmask[ti]-1;
      if (avgdist[k]<avgdist[ti]) {
//        cout << "# " << arr.keys(ti) << " " << ti << " " << uniqmask[ti] << " " << avgdist[ti] << " " << counts[ti] << endl;
        k=ti;
      }
    }
//    cout << "OTU" << j << " " << otus[j].size() << " " << arr.keys(k) << " " << avgdist[k]/(otus[j].size()-1) << " " << dists.size() << endl;
    cout << ">OTU" << j << " " << arr.keys(k) << " avg_id=" << avgdist[k]/(otus[j].size()-1) << " otu_size=" << otus[j].size() << endl;
    cout << uarr.values(k) << endl;
  }
  cerr << endl;
  exit(0);
}