char * read_line(char sequence[], FILE * pFilePtr) { char *pcRes = NULL; long lineLength = 0; char current_line_buffer[MAX_READ_BUFFER] = {0}; while((pcRes = fgets(current_line_buffer, sizeof(current_line_buffer), pFilePtr)) != NULL){ if(size_of_string(sequence) > 0) { sequence = realloc(sequence, sizeof(char)*(size_of_string(sequence) + size_of_string(current_line_buffer) + 2) ); } concat_strings_created_with_malloc(sequence,current_line_buffer); current_line_buffer[0] = '\0'; lineLength = size_of_string(sequence); //if end of line character is found then exit from loop if((sequence)[lineLength] == '\n' || (sequence)[lineLength] == '\0'){ break; } } return sequence; }
void create_phylip_of_snp_sites(char filename[], int number_of_snps, char ** bases_for_snps, char ** sequence_names, int number_of_samples, int internal_nodes[]) { FILE *fasta_file_pointer; int sample_counter; int snp_counter; char * base_filename; base_filename = (char *) calloc(1024,sizeof(char)); memcpy(base_filename, filename, 1024*sizeof(char)); char extension[8] = {".phylip"}; concat_strings_created_with_malloc(base_filename,extension); fasta_file_pointer = fopen(base_filename, "w"); int number_of_leaves = number_of_samples; for(sample_counter=0; sample_counter< number_of_samples; sample_counter++) { if(internal_nodes[sample_counter] == 1) { number_of_leaves--; } } fprintf( fasta_file_pointer, "%d %d\n", number_of_leaves, number_of_snps); for(sample_counter=0; sample_counter< number_of_samples; sample_counter++) { // sequence_name can be more than 10 (relaxed phylip format) and contain [\w\s] //TODO check for illegal characters [^\w\s] if(internal_nodes[sample_counter] == 1) { continue; } fprintf( fasta_file_pointer, "%s\t", sequence_names[sample_counter]); for(snp_counter=0; snp_counter< number_of_snps; snp_counter++) { fprintf( fasta_file_pointer, "%c", bases_for_snps[snp_counter][sample_counter]); } fprintf( fasta_file_pointer, "\n"); } fclose(fasta_file_pointer); free(base_filename); }
int generate_snp_sites(char filename[],int output_multi_fasta_file, int output_vcf_file, int output_phylip_file, char output_filename[]) { size_t length_of_genome; char * reference_sequence; int number_of_snps; int * snp_locations; int number_of_samples; int i; length_of_genome = genome_length(filename); reference_sequence = (char *) calloc((length_of_genome +1),sizeof(char)); build_reference_sequence(reference_sequence,filename); number_of_snps = detect_snps(reference_sequence, filename, length_of_genome); snp_locations = (int *) calloc((number_of_snps+1),sizeof(int)); build_snp_locations(snp_locations, reference_sequence); free(reference_sequence); number_of_samples = number_of_sequences_in_file(filename); // Find out the names of the sequences char* sequence_names[number_of_samples]; sequence_names[number_of_samples-1] = '\0'; for(i = 0; i < number_of_samples; i++) { sequence_names[i] = calloc(MAX_SAMPLE_NAME_SIZE,sizeof(char)); } get_sample_names_for_header(filename, sequence_names, number_of_samples); char* bases_for_snps[number_of_snps]; for(i = 0; i < number_of_snps; i++) { bases_for_snps[i] = calloc(number_of_samples+1 ,sizeof(char)); } get_bases_for_each_snp(filename, snp_locations, bases_for_snps, length_of_genome, number_of_snps); char output_filename_base[MAX_FILENAME_SIZE]; char filename_without_directory[MAX_FILENAME_SIZE]; strip_directory_from_filename(filename, filename_without_directory); memcpy(output_filename_base,filename_without_directory, size_of_string(filename_without_directory)+1 ); if(output_filename != NULL && *output_filename != '\0') { memcpy(output_filename_base,output_filename, size_of_string(output_filename)+1 ); } if(output_vcf_file) { char * vcf_output_filename; vcf_output_filename = calloc(MAX_FILENAME_SIZE,sizeof(char)); memcpy(vcf_output_filename, output_filename_base, (MAX_FILENAME_SIZE)*sizeof(char)); if((output_vcf_file + output_phylip_file + output_multi_fasta_file) > 1 || (output_filename == NULL || *output_filename == '\0') ) { char extension[5] = {".vcf"}; concat_strings_created_with_malloc(vcf_output_filename,extension); } create_vcf_file(vcf_output_filename, snp_locations, number_of_snps, bases_for_snps, sequence_names, number_of_samples, length_of_genome); free(vcf_output_filename); } if(output_phylip_file) { char *phylip_output_filename; phylip_output_filename = calloc(MAX_FILENAME_SIZE,sizeof(char)); memcpy(phylip_output_filename, output_filename_base, (MAX_FILENAME_SIZE)*sizeof(char)); if((output_vcf_file + output_phylip_file + output_multi_fasta_file) > 1 || (output_filename == NULL || *output_filename == '\0') ) { char extension[10] = {".phylip"}; concat_strings_created_with_malloc(phylip_output_filename,extension); } create_phylib_of_snp_sites(phylip_output_filename, number_of_snps, bases_for_snps, sequence_names, number_of_samples); free(phylip_output_filename); } if((output_multi_fasta_file) || (output_vcf_file ==0 && output_phylip_file == 0 && output_multi_fasta_file == 0)) { char *multi_fasta_output_filename; multi_fasta_output_filename = calloc(MAX_FILENAME_SIZE,sizeof(char)); memcpy(multi_fasta_output_filename, output_filename_base, (MAX_FILENAME_SIZE)*sizeof(char)); if((output_vcf_file + output_phylip_file + output_multi_fasta_file) > 1 || (output_filename == NULL || *output_filename == '\0') ) { char extension[20] = {".snp_sites.aln"}; concat_strings_created_with_malloc(multi_fasta_output_filename,extension); } create_fasta_of_snp_sites(multi_fasta_output_filename, number_of_snps, bases_for_snps, sequence_names, number_of_samples); free(multi_fasta_output_filename); } // free memory free(snp_locations); for(i = 0; i < number_of_samples; i++) { free(sequence_names[i]); } for(i = 0; i < number_of_snps; i++) { free(bases_for_snps[i]); } return 1; }