int generate_snp_sites(char filename[],int output_multi_fasta_file, int output_vcf_file, int output_phylip_file, char output_filename[]) { size_t length_of_genome; char * reference_sequence; int number_of_snps; int * snp_locations; int number_of_samples; int i; length_of_genome = genome_length(filename); reference_sequence = (char *) calloc((length_of_genome +1),sizeof(char)); build_reference_sequence(reference_sequence,filename); number_of_snps = detect_snps(reference_sequence, filename, length_of_genome); snp_locations = (int *) calloc((number_of_snps+1),sizeof(int)); build_snp_locations(snp_locations, reference_sequence); free(reference_sequence); number_of_samples = number_of_sequences_in_file(filename); // Find out the names of the sequences char* sequence_names[number_of_samples]; sequence_names[number_of_samples-1] = '\0'; for(i = 0; i < number_of_samples; i++) { sequence_names[i] = calloc(MAX_SAMPLE_NAME_SIZE,sizeof(char)); } get_sample_names_for_header(filename, sequence_names, number_of_samples); char* bases_for_snps[number_of_snps]; for(i = 0; i < number_of_snps; i++) { bases_for_snps[i] = calloc(number_of_samples+1 ,sizeof(char)); } get_bases_for_each_snp(filename, snp_locations, bases_for_snps, length_of_genome, number_of_snps); char output_filename_base[FILENAME_MAX]; char filename_without_directory[FILENAME_MAX]; strip_directory_from_filename(filename, filename_without_directory); strncpy(output_filename_base, filename_without_directory, FILENAME_MAX); if(output_filename != NULL && *output_filename != '\0') { strncpy(output_filename_base, output_filename, FILENAME_MAX); } if(output_vcf_file) { char vcf_output_filename[FILENAME_MAX]; strncpy(vcf_output_filename, output_filename_base, FILENAME_MAX); if((output_vcf_file + output_phylip_file + output_multi_fasta_file) > 1 || (output_filename == NULL || *output_filename == '\0') ) { strcat(vcf_output_filename, ".vcf"); } create_vcf_file(vcf_output_filename, snp_locations, number_of_snps, bases_for_snps, 
sequence_names, number_of_samples, length_of_genome); } if(output_phylip_file) { char phylip_output_filename[FILENAME_MAX]; strncpy(phylip_output_filename, output_filename_base, FILENAME_MAX); if((output_vcf_file + output_phylip_file + output_multi_fasta_file) > 1 || (output_filename == NULL || *output_filename == '\0') ) { strcat(phylip_output_filename, ".phylip"); } create_phylib_of_snp_sites(phylip_output_filename, number_of_snps, bases_for_snps, sequence_names, number_of_samples); } if((output_multi_fasta_file) || (output_vcf_file ==0 && output_phylip_file == 0 && output_multi_fasta_file == 0)) { char multi_fasta_output_filename[FILENAME_MAX]; strncpy(multi_fasta_output_filename, output_filename_base, FILENAME_MAX); if((output_vcf_file + output_phylip_file + output_multi_fasta_file) > 1 || (output_filename == NULL || *output_filename == '\0') ) { strcat(multi_fasta_output_filename, ".snp_sites.aln"); } create_fasta_of_snp_sites(multi_fasta_output_filename, number_of_snps, bases_for_snps, sequence_names, number_of_samples); } // free memory free(snp_locations); for(i = 0; i < number_of_samples; i++) { free(sequence_names[i]); } for(i = 0; i < number_of_snps; i++) { free(bases_for_snps[i]); } return 1; }
/*
 * Detect the SNP sites of the sequences in `filename` and write them out in
 * the requested formats.
 *
 * filename                 input sequence file
 * output_multi_fasta_file  non-zero: write the SNP sites as a multi-FASTA alignment
 * output_vcf_file          non-zero: write a VCF file
 * output_phylip_file       non-zero: write a PHYLIP file
 * output_filename          output name chosen by the caller; NULL or "" means
 *                          "use `filename` with its directory stripped"
 * output_reference         forwarded unchanged to the PHYLIP and FASTA writers
 * pure_mode                forwarded unchanged to detect_snps()
 * output_monomorphic       forwarded unchanged to detect_snps()
 *
 * If none of the three format flags is set, the multi-FASTA alignment is
 * written.  A format extension (".vcf", ".phylip", ".snp_sites.aln") is
 * appended when the sum of the three flags exceeds 1 or when no output name
 * was supplied; otherwise `output_filename` is used verbatim.  Names that do
 * not fit in FILENAME_MAX bytes are truncated instead of overflowing.
 *
 * Before returning, the buffers returned by get_snp_locations() and
 * get_pseudo_reference_sequence() are freed, as is the bases_for_snps table.
 *
 * Returns 1 on success, 0 if the SNP count is negative or memory cannot be
 * allocated (nothing is written in that case).
 */
static int generate_snp_sites_generic(char filename[], int output_multi_fasta_file, int output_vcf_file, int output_phylip_file, char output_filename[], int output_reference, int pure_mode, int output_monomorphic)
{
  const char *output_filename_base;
  char filename_without_directory[FILENAME_MAX];
  int allocated_snps = 0;
  int name_from_input;
  int append_extension;
  int status = 0;
  int i;

  detect_snps(filename, pure_mode, output_monomorphic);

  // Remember how many rows are allocated so exactly those rows are freed later.
  allocated_snps = get_number_of_snps();
  if (allocated_snps < 0) {
    bases_for_snps = NULL;
    goto cleanup;
  }

  bases_for_snps = calloc(allocated_snps + 1, sizeof(char*));
  if (bases_for_snps == NULL) {
    goto cleanup;
  }
  for (i = 0; i < allocated_snps; i++) {
    bases_for_snps[i] = calloc(get_number_of_samples()+1, sizeof(char));
    if (bases_for_snps[i] == NULL) {
      goto cleanup;
    }
  }

  get_bases_for_each_snp(filename, bases_for_snps);

  // Work out the base name shared by every output file.
  strip_directory_from_filename(filename, filename_without_directory);
  name_from_input = (output_filename == NULL || *output_filename == '\0');
  output_filename_base = name_from_input ? filename_without_directory : output_filename;
  append_extension = name_from_input
      || (output_vcf_file + output_phylip_file + output_multi_fasta_file) > 1;

  if (output_vcf_file) {
    char vcf_output_filename[FILENAME_MAX];
    // snprintf always terminates and never writes past the buffer.
    snprintf(vcf_output_filename, sizeof vcf_output_filename, "%s%s",
             output_filename_base, append_extension ? ".vcf" : "");
    create_vcf_file(vcf_output_filename, get_snp_locations(), get_number_of_snps(), bases_for_snps, get_sequence_names(), get_number_of_samples(), get_length_of_genome(), get_pseudo_reference_sequence());
  }

  if (output_phylip_file) {
    char phylip_output_filename[FILENAME_MAX];
    snprintf(phylip_output_filename, sizeof phylip_output_filename, "%s%s",
             output_filename_base, append_extension ? ".phylip" : "");
    create_phylib_of_snp_sites(phylip_output_filename, get_number_of_snps(), bases_for_snps, get_sequence_names(), get_number_of_samples(), output_reference, get_pseudo_reference_sequence(),get_snp_locations());
  }

  // The multi-FASTA alignment is also the default when no format was requested.
  if ((output_multi_fasta_file) || (output_vcf_file == 0 && output_phylip_file == 0 && output_multi_fasta_file == 0)) {
    char multi_fasta_output_filename[FILENAME_MAX];
    snprintf(multi_fasta_output_filename, sizeof multi_fasta_output_filename, "%s%s",
             output_filename_base, append_extension ? ".snp_sites.aln" : "");
    create_fasta_of_snp_sites(multi_fasta_output_filename, get_number_of_snps(), bases_for_snps, get_sequence_names(), get_number_of_samples(), output_reference, get_pseudo_reference_sequence(),get_snp_locations());
  }

  status = 1;

cleanup:
  // free memory
  free(get_snp_locations());

  // NOTE(review): the strings behind get_sequence_names() are not released
  // here; the original code had that free commented out. Confirm who owns them.

  if (bases_for_snps != NULL) {
    // Rows that were never allocated are NULL, which free() accepts.
    for (i = 0; i < allocated_snps; i++) {
      free(bases_for_snps[i]);
    }
    // The table itself was previously leaked; clear the pointer so no stale
    // reference to the freed rows survives this call.
    free(bases_for_snps);
    bases_for_snps = NULL;
  }

  free(get_pseudo_reference_sequence());
  return status;
}
/*
 * Rebuild the tree in `tree_filename` from the SNPs listed in a VCF file,
 * re-filter those SNPs, write the filtered SNP sites in PHYLIP, VCF and FASTA
 * form, and overwrite `tree_filename` with the rescaled tree.
 *
 * vcf_filename                   VCF file to read column names and SNP positions from
 * tree_filename                  tree passed to build_newick_tree(); also handed to the
 *                                three writers and finally overwritten with the new tree
 * multi_fasta_filename           not used by this function
 * min_snps                       forwarded unchanged to build_newick_tree()
 * original_multi_fasta_filename  file whose genome length is passed to the tree builder
 *
 * The SNP count, sample names and sequences come from the *_parse_phylip /
 * *_in_phylip helpers, so that data must already be available when this is
 * called (presumably loaded by the caller - confirm).
 *
 * If the VCF file cannot be opened, or an allocation fails, a message is
 * printed or the function simply returns without producing output.
 */
void extract_sequences(char vcf_filename[], char tree_filename[],char multi_fasta_filename[],int min_snps, char original_multi_fasta_filename[])
{
  FILE *vcf_file_pointer;
  FILE *output_tree_pointer;
  newick_node *root_node = NULL;
  char **column_names = NULL;
  char **sample_names = NULL;
  char **filtered_bases_for_snps = NULL;
  char *reference_sequence_bases = NULL;
  int *snp_locations = NULL;
  int *filtered_snp_locations = NULL;
  int *internal_nodes = NULL;
  int number_of_snps = 0;
  int number_of_columns = 0;
  int number_of_samples = 0;
  int number_of_filtered_snps = 0;
  int length_of_original_genome;
  int tree_built = 0;
  int i;

  (void) multi_fasta_filename;

  vcf_file_pointer = fopen(vcf_filename, "r");
  if (vcf_file_pointer == NULL) {
    // Nothing has been allocated yet, so there is nothing to clean up.
    fprintf(stderr, "Unable to open VCF file '%s' for reading\n", vcf_filename);
    return;
  }

  length_of_original_genome = genome_length(original_multi_fasta_filename);

  number_of_columns = get_number_of_columns_from_file(vcf_file_pointer);
  if (number_of_columns < 0) {
    goto cleanup;
  }

  // Every table below is sized by the input, so it lives on the heap: a stack
  // array is undefined for 0 entries and can exhaust the stack for large
  // inputs. One spare slot keeps a count of 0 a valid allocation.
  column_names = calloc((size_t) number_of_columns + 1, sizeof *column_names);
  if (column_names == NULL) {
    goto cleanup;
  }
  for (i = 0; i < number_of_columns; i++) {
    column_names[i] = calloc(MAX_SAMPLE_NAME_SIZE, sizeof(char));
    if (column_names[i] == NULL) {
      goto cleanup;
    }
  }
  get_column_names(vcf_file_pointer, column_names, number_of_columns);

  number_of_snps = number_of_snps_in_phylip();
  if (number_of_snps < 0) {
    goto cleanup;
  }

  snp_locations = calloc((size_t) number_of_snps + 1, sizeof *snp_locations);
  filtered_snp_locations = calloc((size_t) number_of_snps + 1, sizeof *filtered_snp_locations);
  reference_sequence_bases = calloc((size_t) number_of_snps + 1, sizeof(char));
  if (snp_locations == NULL || filtered_snp_locations == NULL || reference_sequence_bases == NULL) {
    goto cleanup;
  }

  get_integers_from_column_in_vcf(vcf_file_pointer, snp_locations, number_of_snps, column_number_for_column_name(column_names, "POS", number_of_columns));

  root_node = build_newick_tree(tree_filename, vcf_file_pointer, snp_locations, number_of_snps, column_names, number_of_columns, length_of_original_genome, min_snps);
  tree_built = 1;
  fclose(vcf_file_pointer);
  vcf_file_pointer = NULL;

  number_of_samples = number_of_samples_from_parse_phylip();
  if (number_of_samples < 1) {
    // sample_names[0] is needed below, so at least one sample is required.
    goto cleanup;
  }

  sample_names = calloc((size_t) number_of_samples + 1, sizeof *sample_names);
  internal_nodes = calloc((size_t) number_of_samples, sizeof *internal_nodes);
  if (sample_names == NULL || internal_nodes == NULL) {
    goto cleanup;
  }
  get_sample_names_from_parse_phylip(sample_names);

  // The first sample's sequence is used as the reference.
  get_sequence_for_sample_name(reference_sequence_bases, sample_names[0]);

  for (i = 0; i < number_of_samples; i++) {
    internal_nodes[i] = get_internal_node(i);
  }

  number_of_filtered_snps = refilter_existing_snps(reference_sequence_bases, number_of_snps, snp_locations, filtered_snp_locations, internal_nodes);
  if (number_of_filtered_snps < 0) {
    goto cleanup;
  }

  filtered_bases_for_snps = calloc((size_t) number_of_filtered_snps + 1, sizeof *filtered_bases_for_snps);
  if (filtered_bases_for_snps == NULL) {
    goto cleanup;
  }
  filter_sequence_bases_and_rotate(reference_sequence_bases, filtered_bases_for_snps, number_of_filtered_snps);

  create_phylip_of_snp_sites(tree_filename, number_of_filtered_snps, filtered_bases_for_snps, sample_names, number_of_samples, internal_nodes);
  create_vcf_file(tree_filename, filtered_snp_locations, number_of_filtered_snps, filtered_bases_for_snps, sample_names, number_of_samples, internal_nodes, 0);
  create_fasta_of_snp_sites(tree_filename, number_of_filtered_snps, filtered_bases_for_snps, sample_names, number_of_samples, internal_nodes);

  // Create a new tree with updated distances
  scale_branch_distances(root_node, number_of_filtered_snps);

  output_tree_pointer = fopen(tree_filename, "w");
  if (output_tree_pointer == NULL) {
    fprintf(stderr, "Unable to open tree file '%s' for writing\n", tree_filename);
  } else {
    print_tree(root_node, output_tree_pointer);
    fprintf(output_tree_pointer, ";");
    fflush(output_tree_pointer);
    fclose(output_tree_pointer);
  }

cleanup:
  if (vcf_file_pointer != NULL) {
    fclose(vcf_file_pointer);
  }

  // NOTE: the original author flagged a segfault somewhere in this cleanup.
  // The release order (columns, samples, filtered bases, tree, sequences,
  // reference) is kept exactly as it was; entries never allocated are NULL.
  if (column_names != NULL) {
    for (i = 0; i < number_of_columns; i++) {
      free(column_names[i]);
    }
    free(column_names);
  }
  if (sample_names != NULL) {
    for (i = 0; i < number_of_samples; i++) {
      free(sample_names[i]);
    }
    free(sample_names);
  }
  if (filtered_bases_for_snps != NULL) {
    for (i = 0; i < number_of_filtered_snps; i++) {
      free(filtered_bases_for_snps[i]);
    }
    free(filtered_bases_for_snps);
  }
  if (tree_built) {
    // Only performed once the tree exists, matching the state in which the
    // original code reached these calls.
    cleanup_node_memory(root_node);
    seqFreeAll();
  }
  free(reference_sequence_bases);
  free(internal_nodes);
  free(filtered_snp_locations);
  free(snp_locations);
}