Example #1
0
/*
 * Read every record of a multi-FASTA file into the `sequences` table and fill
 * `phylip_sample_names` with the record names.
 *
 * `num_snps`, `num_samples`, `sequences` and `phylip_sample_names` are declared
 * outside this function; they are (re)assigned here and the previous contents
 * are not freed.
 *
 * Each base is upper-cased and 'N' is stored as '-'.
 *
 * filename: path handed to gzopen() and to the project helpers that measure
 *           the alignment.
 *
 * Robustness notes:
 *   - at most `num_samples` records are stored, so a file with more records
 *     than were counted cannot write past the end of `sequences`;
 *   - at most `seq->seq.l` bases are copied from a record, so a record shorter
 *     than `num_snps` is not read out of bounds (the rest of its row stays
 *     zero-filled from calloc);
 *   - if the file cannot be opened the rows stay zero-filled and the
 *     initialisation routines still run.
 */
void load_sequences_from_multifasta_file(char filename[])
{
	int i;
	int sequence_number = 0;
	gzFile fp;
	kseq_t *seq;

	num_snps    = genome_length(filename);
	num_samples = number_of_sequences_in_file(filename);

	/* One spare slot per table; calloc leaves it NULL. */
	sequences = (char **) calloc((num_samples+1),sizeof(char *));
	phylip_sample_names = (char **) calloc((num_samples+1),sizeof(char *));

	for(i = 0; i < num_samples; i++)
	{
		/* +1 keeps a terminating NUL after the last base / name character. */
		sequences[i] = (char *) calloc((num_snps+1),sizeof(char));
		phylip_sample_names[i] = (char *) calloc((MAX_SAMPLE_NAME_SIZE+1),sizeof(char));
	}
	get_sample_names_for_header(filename, phylip_sample_names, num_samples);

	fp = gzopen(filename, "r");
	if(fp != NULL)
	{
		seq = kseq_init(fp);

		/* Stop as soon as every allocated row has been filled. */
		while(sequence_number < num_samples && kseq_read(seq) >= 0)
		{
			for(i = 0; i < num_snps && (size_t) i < seq->seq.l; i++)
			{
				/* toupper() requires a value representable as unsigned char. */
				char base = (char) toupper((unsigned char) seq->seq.s[i]);

				if(base == 'N')
				{
					base = '-';
				}
				sequences[sequence_number][i] = base;
			}
			sequence_number++;
		}

		kseq_destroy(seq);
		gzclose(fp);
	}

	initialise_statistics();
	initialise_internal_node();
}
Example #2
0
/*
 * Write `base` into `dest` (capacity FILENAME_MAX), followed by `suffix` when
 * `append_suffix` is non-zero. The result is always NUL-terminated and is
 * truncated if it does not fit.
 */
static void compose_output_filename(char dest[], const char *base, const char *suffix, int append_suffix)
{
	snprintf(dest, FILENAME_MAX, "%s%s", base, append_suffix ? suffix : "");
}

/*
 * Find the SNP sites of an alignment file and write them out in the requested
 * formats.
 *
 * filename:                input alignment, passed to the project helpers.
 * output_multi_fasta_file: non-zero to write a multi-FASTA of the SNP sites.
 * output_vcf_file:         non-zero to write a VCF file.
 * output_phylip_file:      non-zero to write a phylip file.
 * output_filename:         output name; NULL or "" means "derive it from the
 *                          input file name without its directory".
 *
 * When no format flag is set, the multi-FASTA output is produced.
 * A format extension (".vcf", ".phylip", ".snp_sites.aln") is appended when
 * more than one format is requested or when no output name was given;
 * otherwise the given output name is used verbatim.
 *
 * Returns 1 on success. Returns 0 if a helper reports a negative count or a
 * memory allocation fails (the original code dereferenced NULL in that case).
 */
int generate_snp_sites(char filename[],int output_multi_fasta_file, int output_vcf_file, int output_phylip_file, char output_filename[])
{
	size_t length_of_genome;
	char *reference_sequence = NULL;
	int *snp_locations = NULL;
	char **sequence_names = NULL;
	char **bases_for_snps = NULL;
	int number_of_snps = 0;
	int number_of_samples = 0;
	int has_output_filename;
	int append_suffix;
	int status = 0;
	int i;
	char output_filename_base[FILENAME_MAX];
	char filename_without_directory[FILENAME_MAX];
	char format_output_filename[FILENAME_MAX];

	length_of_genome = genome_length(filename);
	reference_sequence = calloc(length_of_genome + 1, sizeof(char));
	if(reference_sequence == NULL)
	{
		goto cleanup;
	}

	build_reference_sequence(reference_sequence, filename);
	number_of_snps = detect_snps(reference_sequence, filename, length_of_genome);
	if(number_of_snps < 0)
	{
		goto cleanup;
	}

	snp_locations = calloc((size_t) number_of_snps + 1, sizeof(int));
	if(snp_locations == NULL)
	{
		goto cleanup;
	}
	build_snp_locations(snp_locations, reference_sequence);

	/* The reference is only needed to locate the SNPs. */
	free(reference_sequence);
	reference_sequence = NULL;

	number_of_samples = number_of_sequences_in_file(filename);
	if(number_of_samples < 0)
	{
		goto cleanup;
	}

	/*
	 * Heap tables instead of variable-length arrays: a zero-length VLA is
	 * undefined behaviour and a large SNP count could exhaust the stack.
	 * The extra slot keeps the allocation non-empty and NULL-terminated.
	 */
	sequence_names = calloc((size_t) number_of_samples + 1, sizeof(char *));
	if(sequence_names == NULL)
	{
		goto cleanup;
	}
	for(i = 0; i < number_of_samples; i++)
	{
		/* +1 leaves room for a NUL after a name of MAX_SAMPLE_NAME_SIZE
		 * characters -- assumes the helper may write that many; TODO confirm. */
		sequence_names[i] = calloc(MAX_SAMPLE_NAME_SIZE + 1, sizeof(char));
		if(sequence_names[i] == NULL)
		{
			goto cleanup;
		}
	}

	get_sample_names_for_header(filename, sequence_names, number_of_samples);

	bases_for_snps = calloc((size_t) number_of_snps + 1, sizeof(char *));
	if(bases_for_snps == NULL)
	{
		goto cleanup;
	}
	for(i = 0; i < number_of_snps; i++)
	{
		/* One base per sample plus a terminating NUL. */
		bases_for_snps[i] = calloc((size_t) number_of_samples + 1, sizeof(char));
		if(bases_for_snps[i] == NULL)
		{
			goto cleanup;
		}
	}

	get_bases_for_each_snp(filename, snp_locations, bases_for_snps, length_of_genome, number_of_snps);

	/* Default output base: the input file name without its directory. */
	strip_directory_from_filename(filename, filename_without_directory);
	compose_output_filename(output_filename_base, filename_without_directory, "", 0);

	has_output_filename = (output_filename != NULL && *output_filename != '\0');
	if(has_output_filename)
	{
		compose_output_filename(output_filename_base, output_filename, "", 0);
	}

	/* An explicit name is used as-is only when it names a single output. */
	append_suffix = (output_vcf_file + output_phylip_file + output_multi_fasta_file) > 1 || !has_output_filename;

	if(output_vcf_file)
	{
		compose_output_filename(format_output_filename, output_filename_base, ".vcf", append_suffix);
		create_vcf_file(format_output_filename, snp_locations, number_of_snps, bases_for_snps, sequence_names, number_of_samples, length_of_genome);
	}

	if(output_phylip_file)
	{
		compose_output_filename(format_output_filename, output_filename_base, ".phylip", append_suffix);
		create_phylib_of_snp_sites(format_output_filename, number_of_snps, bases_for_snps, sequence_names, number_of_samples);
	}

	/* Multi-FASTA is also the fallback when no format was requested. */
	if((output_multi_fasta_file) || (output_vcf_file == 0 && output_phylip_file == 0 && output_multi_fasta_file == 0))
	{
		compose_output_filename(format_output_filename, output_filename_base, ".snp_sites.aln", append_suffix);
		create_fasta_of_snp_sites(format_output_filename, number_of_snps, bases_for_snps, sequence_names, number_of_samples);
	}

	status = 1;

cleanup:
	/* Tables are zero-initialised, so unfilled slots are NULL and safe to free. */
	free(reference_sequence);
	free(snp_locations);
	if(sequence_names != NULL)
	{
		for(i = 0; i < number_of_samples; i++)
		{
			free(sequence_names[i]);
		}
		free(sequence_names);
	}
	if(bases_for_snps != NULL)
	{
		for(i = 0; i < number_of_snps; i++)
		{
			free(bases_for_snps[i]);
		}
		free(bases_for_snps);
	}

	return status;
}