void load_sequences_from_multifasta_file(char filename[]) { int i; num_snps = genome_length(filename); num_samples = number_of_sequences_in_file(filename); sequences = (char **) calloc((num_samples+1),sizeof(char *)); phylip_sample_names = (char **) calloc((num_samples+1),sizeof(char *)); for(i = 0; i < num_samples; i++) { sequences[i] = (char *) calloc((num_snps+1),sizeof(char)); phylip_sample_names[i] = (char *) calloc((MAX_SAMPLE_NAME_SIZE+1),sizeof(char)); } get_sample_names_for_header(filename, phylip_sample_names, num_samples); int l; i = 0; int sequence_number = 0; gzFile fp; kseq_t *seq; fp = gzopen(filename, "r"); seq = kseq_init(fp); while ((l = kseq_read(seq)) >= 0) { for(i = 0; i< num_snps; i++) { sequences[sequence_number][i] = toupper(((char *) seq->seq.s)[i]); if(sequences[sequence_number][i] == 'N') { sequences[sequence_number][i] = '-'; } } sequence_number++; } kseq_destroy(seq); gzclose(fp); initialise_statistics(); initialise_internal_node(); }
int generate_snp_sites(char filename[],int output_multi_fasta_file, int output_vcf_file, int output_phylip_file, char output_filename[]) { size_t length_of_genome; char * reference_sequence; int number_of_snps; int * snp_locations; int number_of_samples; int i; length_of_genome = genome_length(filename); reference_sequence = (char *) calloc((length_of_genome +1),sizeof(char)); build_reference_sequence(reference_sequence,filename); number_of_snps = detect_snps(reference_sequence, filename, length_of_genome); snp_locations = (int *) calloc((number_of_snps+1),sizeof(int)); build_snp_locations(snp_locations, reference_sequence); free(reference_sequence); number_of_samples = number_of_sequences_in_file(filename); // Find out the names of the sequences char* sequence_names[number_of_samples]; sequence_names[number_of_samples-1] = '\0'; for(i = 0; i < number_of_samples; i++) { sequence_names[i] = calloc(MAX_SAMPLE_NAME_SIZE,sizeof(char)); } get_sample_names_for_header(filename, sequence_names, number_of_samples); char* bases_for_snps[number_of_snps]; for(i = 0; i < number_of_snps; i++) { bases_for_snps[i] = calloc(number_of_samples+1 ,sizeof(char)); } get_bases_for_each_snp(filename, snp_locations, bases_for_snps, length_of_genome, number_of_snps); char output_filename_base[FILENAME_MAX]; char filename_without_directory[FILENAME_MAX]; strip_directory_from_filename(filename, filename_without_directory); strncpy(output_filename_base, filename_without_directory, FILENAME_MAX); if(output_filename != NULL && *output_filename != '\0') { strncpy(output_filename_base, output_filename, FILENAME_MAX); } if(output_vcf_file) { char vcf_output_filename[FILENAME_MAX]; strncpy(vcf_output_filename, output_filename_base, FILENAME_MAX); if((output_vcf_file + output_phylip_file + output_multi_fasta_file) > 1 || (output_filename == NULL || *output_filename == '\0') ) { strcat(vcf_output_filename, ".vcf"); } create_vcf_file(vcf_output_filename, snp_locations, number_of_snps, bases_for_snps, sequence_names, number_of_samples, length_of_genome); } if(output_phylip_file) { char phylip_output_filename[FILENAME_MAX]; strncpy(phylip_output_filename, output_filename_base, FILENAME_MAX); if((output_vcf_file + output_phylip_file + output_multi_fasta_file) > 1 || (output_filename == NULL || *output_filename == '\0') ) { strcat(phylip_output_filename, ".phylip"); } create_phylib_of_snp_sites(phylip_output_filename, number_of_snps, bases_for_snps, sequence_names, number_of_samples); } if((output_multi_fasta_file) || (output_vcf_file ==0 && output_phylip_file == 0 && output_multi_fasta_file == 0)) { char multi_fasta_output_filename[FILENAME_MAX]; strncpy(multi_fasta_output_filename, output_filename_base, FILENAME_MAX); if((output_vcf_file + output_phylip_file + output_multi_fasta_file) > 1 || (output_filename == NULL || *output_filename == '\0') ) { strcat(multi_fasta_output_filename, ".snp_sites.aln"); } create_fasta_of_snp_sites(multi_fasta_output_filename, number_of_snps, bases_for_snps, sequence_names, number_of_samples); } // free memory free(snp_locations); for(i = 0; i < number_of_samples; i++) { free(sequence_names[i]); } for(i = 0; i < number_of_snps; i++) { free(bases_for_snps[i]); } return 1; }