예제 #1
0
/*
 * Write the SNP sites found in the alignment `filename` in the requested
 * output formats.
 *
 * filename                 input alignment; with its directory part stripped
 *                          it is also the default base name for the outputs
 * output_multi_fasta_file  non-zero to write the multi-FASTA alignment
 * output_vcf_file          non-zero to write the VCF file
 * output_phylip_file       non-zero to write the PHYLIP file
 * output_filename          optional output name; NULL or "" selects the
 *                          default base name
 *
 * When none of the three format flags is set, the multi-FASTA alignment is
 * written.  A format suffix (".vcf", ".phylip", ".snp_sites.aln") is appended
 * when the format flags sum to more than one or when no output name was
 * supplied; otherwise the supplied name is used as is.  Names longer than
 * FILENAME_MAX - 1 characters are truncated instead of overflowing.
 *
 * Returns 1 on success, 0 when an allocation fails or a helper reports a
 * negative SNP/sample count.
 */
int generate_snp_sites(char filename[],int output_multi_fasta_file, int output_vcf_file, int output_phylip_file, char output_filename[])
{
	size_t length_of_genome;
	char *reference_sequence = NULL;
	int *snp_locations = NULL;
	char **sequence_names = NULL;
	char **bases_for_snps = NULL;
	int number_of_snps = 0;
	int number_of_samples = 0;
	int has_custom_name;
	int append_suffix;
	int status = 0;
	int i;
	char output_filename_base[FILENAME_MAX];
	char filename_without_directory[FILENAME_MAX] = {0};
	char format_output_filename[FILENAME_MAX];

	length_of_genome = genome_length(filename);
	reference_sequence = calloc(length_of_genome + 1, sizeof *reference_sequence);
	if(reference_sequence == NULL)
	{
		goto cleanup;
	}

	build_reference_sequence(reference_sequence, filename);
	number_of_snps = detect_snps(reference_sequence, filename, length_of_genome);
	if(number_of_snps < 0)
	{
		// A negative count cannot size the tables below.
		number_of_snps = 0;
		goto cleanup;
	}

	snp_locations = calloc((size_t) number_of_snps + 1, sizeof *snp_locations);
	if(snp_locations == NULL)
	{
		goto cleanup;
	}
	build_snp_locations(snp_locations, reference_sequence);

	// The reference is only needed to locate the SNPs.
	free(reference_sequence);
	reference_sequence = NULL;

	number_of_samples = number_of_sequences_in_file(filename);
	if(number_of_samples < 0)
	{
		number_of_samples = 0;
		goto cleanup;
	}

	// Find out the names of the sequences.
	// The pointer tables live on the heap: their size depends on the input,
	// and calloc leaves unused slots NULL so cleanup can free every slot.
	sequence_names = calloc((size_t) number_of_samples + 1, sizeof *sequence_names);
	if(sequence_names == NULL)
	{
		goto cleanup;
	}
	for(i = 0; i < number_of_samples; i++)
	{
		sequence_names[i] = calloc(MAX_SAMPLE_NAME_SIZE, sizeof(char));
		if(sequence_names[i] == NULL)
		{
			goto cleanup;
		}
	}

	get_sample_names_for_header(filename, sequence_names, number_of_samples);

	// One row per SNP, one base per sample plus a terminating NUL.
	bases_for_snps = calloc((size_t) number_of_snps + 1, sizeof *bases_for_snps);
	if(bases_for_snps == NULL)
	{
		goto cleanup;
	}
	for(i = 0; i < number_of_snps; i++)
	{
		bases_for_snps[i] = calloc((size_t) number_of_samples + 1, sizeof(char));
		if(bases_for_snps[i] == NULL)
		{
			goto cleanup;
		}
	}

	get_bases_for_each_snp(filename, snp_locations, bases_for_snps, length_of_genome, number_of_snps);

	// Choose the base name: the caller's name if given, else the input file name.
	strip_directory_from_filename(filename, filename_without_directory);
	has_custom_name = (output_filename != NULL && *output_filename != '\0');
	snprintf(output_filename_base, sizeof output_filename_base, "%s",
	         has_custom_name ? output_filename : filename_without_directory);

	// A caller-supplied name is used verbatim only when it names a single output.
	append_suffix = (output_vcf_file + output_phylip_file + output_multi_fasta_file) > 1 || !has_custom_name;

	if(output_vcf_file)
	{
		snprintf(format_output_filename, sizeof format_output_filename, "%s%s",
		         output_filename_base, append_suffix ? ".vcf" : "");
		create_vcf_file(format_output_filename, snp_locations, number_of_snps, bases_for_snps, sequence_names, number_of_samples, length_of_genome);
	}

	if(output_phylip_file)
	{
		snprintf(format_output_filename, sizeof format_output_filename, "%s%s",
		         output_filename_base, append_suffix ? ".phylip" : "");
		create_phylib_of_snp_sites(format_output_filename, number_of_snps, bases_for_snps, sequence_names, number_of_samples);
	}

	// Multi-FASTA is also the fallback when no format was requested at all.
	if(output_multi_fasta_file || (!output_vcf_file && !output_phylip_file))
	{
		snprintf(format_output_filename, sizeof format_output_filename, "%s%s",
		         output_filename_base, append_suffix ? ".snp_sites.aln" : "");
		create_fasta_of_snp_sites(format_output_filename, number_of_snps, bases_for_snps, sequence_names, number_of_samples);
	}

	status = 1;

cleanup:
	// free memory (free(NULL) is a no-op, so partially built tables are fine)
	free(reference_sequence);
	free(snp_locations);
	if(sequence_names != NULL)
	{
		for(i = 0; i < number_of_samples; i++)
		{
			free(sequence_names[i]);
		}
		free(sequence_names);
	}
	if(bases_for_snps != NULL)
	{
		for(i = 0; i < number_of_snps; i++)
		{
			free(bases_for_snps[i]);
		}
		free(bases_for_snps);
	}

	return status;
}
예제 #2
0
/*
 * Detect the SNPs in the alignment `filename` and write them in the
 * requested output formats.
 *
 * filename                 input alignment; with its directory part stripped
 *                          it is also the default base name for the outputs
 * output_multi_fasta_file  non-zero to write the multi-FASTA alignment
 * output_vcf_file          non-zero to write the VCF file
 * output_phylip_file       non-zero to write the PHYLIP file
 * output_filename          optional output name; NULL or "" selects the
 *                          default base name
 * output_reference         forwarded to the PHYLIP and multi-FASTA writers
 * pure_mode                forwarded to detect_snps()
 * output_monomorphic       forwarded to detect_snps()
 *
 * When none of the three format flags is set, the multi-FASTA alignment is
 * written.  A format suffix (".vcf", ".phylip", ".snp_sites.aln") is appended
 * when the format flags sum to more than one or when no output name was
 * supplied; otherwise the supplied name is used as is.  Names longer than
 * FILENAME_MAX - 1 characters are truncated instead of overflowing.
 *
 * On every exit path the SNP location array and the pseudo reference
 * sequence returned by their getters are freed, as is the bases_for_snps
 * table built here.
 *
 * Returns 1 on success, 0 when an allocation fails.
 */
static int generate_snp_sites_generic(char filename[],
                                      int output_multi_fasta_file,
                                      int output_vcf_file,
                                      int output_phylip_file,
                                      char output_filename[],
                                      int output_reference, int pure_mode, int output_monomorphic)
{
	char output_filename_base[FILENAME_MAX];
	char filename_without_directory[FILENAME_MAX] = {0};
	char format_output_filename[FILENAME_MAX];
	int has_custom_name;
	int append_suffix;
	int status = 0;
	int i;

	detect_snps(filename, pure_mode, output_monomorphic);

	// calloc zeroes the table, so rows that were never allocated stay NULL
	// and the cleanup loop can free every slot safely.
	bases_for_snps = calloc(get_number_of_snps()+1, sizeof(char*));
	if(bases_for_snps == NULL)
	{
		goto cleanup;
	}

	// One row per SNP, one base per sample plus a terminating NUL.
	for(i = 0; i < get_number_of_snps(); i++)
	{
		bases_for_snps[i] = calloc(get_number_of_samples()+1, sizeof(char));
		if(bases_for_snps[i] == NULL)
		{
			goto cleanup;
		}
	}

	get_bases_for_each_snp(filename, bases_for_snps);

	// Choose the base name: the caller's name if given, else the input file name.
	strip_directory_from_filename(filename, filename_without_directory);
	has_custom_name = (output_filename != NULL && *output_filename != '\0');
	snprintf(output_filename_base, sizeof output_filename_base, "%s",
	         has_custom_name ? output_filename : filename_without_directory);

	// A caller-supplied name is used verbatim only when it names a single output.
	append_suffix = (output_vcf_file + output_phylip_file + output_multi_fasta_file) > 1 || !has_custom_name;

	if(output_vcf_file)
	{
		snprintf(format_output_filename, sizeof format_output_filename, "%s%s",
		         output_filename_base, append_suffix ? ".vcf" : "");
		create_vcf_file(format_output_filename, get_snp_locations(), get_number_of_snps(), bases_for_snps, get_sequence_names(), get_number_of_samples(), get_length_of_genome(), get_pseudo_reference_sequence());
	}

	if(output_phylip_file)
	{
		snprintf(format_output_filename, sizeof format_output_filename, "%s%s",
		         output_filename_base, append_suffix ? ".phylip" : "");
		create_phylib_of_snp_sites(format_output_filename, get_number_of_snps(), bases_for_snps, get_sequence_names(), get_number_of_samples(), output_reference, get_pseudo_reference_sequence(),get_snp_locations());
	}

	// Multi-FASTA is also the fallback when no format was requested at all.
	if(output_multi_fasta_file || (!output_vcf_file && !output_phylip_file))
	{
		snprintf(format_output_filename, sizeof format_output_filename, "%s%s",
		         output_filename_base, append_suffix ? ".snp_sites.aln" : "");
		create_fasta_of_snp_sites(format_output_filename, get_number_of_snps(), bases_for_snps, get_sequence_names(), get_number_of_samples(), output_reference, get_pseudo_reference_sequence(),get_snp_locations());
	}

	status = 1;

cleanup:
	// free memory
	free(get_snp_locations());
	// NOTE(review): the sequence names from get_sequence_names() are not
	// released here; the original loop that would do so was commented out.
	// Confirm where they are owned and freed.
	if(bases_for_snps != NULL)
	{
		for(i = 0; i < get_number_of_snps(); i++)
		{
			free(bases_for_snps[i]);
		}
		// Release the row table too and clear the shared pointer so no
		// dangling table is left behind.
		free(bases_for_snps);
		bases_for_snps = NULL;
	}
	free(get_pseudo_reference_sequence());

	return status;
}
예제 #3
0
/*
 * Read the SNP positions from a VCF file, build the newick tree, re-filter
 * the SNPs, write the filtered PHYLIP / VCF / multi-FASTA outputs and finally
 * rewrite the tree file with rescaled branch distances.
 *
 * vcf_filename                    VCF file to read
 * tree_filename                   tree file; passed to build_newick_tree(),
 *                                 used as the file-name argument of the three
 *                                 writers, and overwritten with the new tree
 * multi_fasta_filename            not used by this function
 * min_snps                        forwarded to build_newick_tree()
 * original_multi_fasta_filename   alignment whose genome length is measured
 *
 * If the VCF file cannot be opened the function reports the error with
 * perror() and returns without doing anything.  If the tree file cannot be
 * opened for writing the error is reported and the tree is not written, but
 * all memory is still released.
 */
void extract_sequences(char vcf_filename[], char tree_filename[],char multi_fasta_filename[],int min_snps, char original_multi_fasta_filename[])
{
	FILE *vcf_file_pointer;
	vcf_file_pointer=fopen(vcf_filename, "r");
	if(vcf_file_pointer == NULL)
	{
		// Every reader below dereferences the stream; bail out early.
		perror(vcf_filename);
		return;
	}
	
	newick_node* root_node;
	int number_of_snps;
	int number_of_columns;
	int i;
	int length_of_original_genome;
	length_of_original_genome = genome_length(original_multi_fasta_filename);	
	
	number_of_columns = get_number_of_columns_from_file(vcf_file_pointer);
	char* column_names[number_of_columns];
	for(i = 0; i < number_of_columns; i++)
	{
		column_names[i] = calloc(MAX_SAMPLE_NAME_SIZE,sizeof(char));
	}
	get_column_names(vcf_file_pointer, column_names, number_of_columns);
	
	number_of_snps  = number_of_snps_in_phylip();
	
	int snp_locations[number_of_snps];
	
	// SNP positions come from the column named "POS".
	get_integers_from_column_in_vcf(vcf_file_pointer, snp_locations, number_of_snps, column_number_for_column_name(column_names, "POS", number_of_columns));

	root_node = build_newick_tree(tree_filename, vcf_file_pointer,snp_locations, number_of_snps, column_names, number_of_columns, length_of_original_genome,min_snps);
	fclose(vcf_file_pointer);

	int filtered_snp_locations[number_of_snps];
	int number_of_filtered_snps;
	int number_of_samples = number_of_samples_from_parse_phylip();

	// The entries of sample_names are filled in by the helper below.
	char * sample_names[number_of_samples];
	get_sample_names_from_parse_phylip(sample_names);

	char * reference_sequence_bases;
	reference_sequence_bases = calloc((number_of_snps+1),sizeof(char));

	// The first sample's sequence is used as the reference for re-filtering.
	get_sequence_for_sample_name(reference_sequence_bases, sample_names[0]);
	int internal_nodes[number_of_samples];
	for(i = 0; i < number_of_samples; i++)
	{
		internal_nodes[i] = get_internal_node(i);
	}

	number_of_filtered_snps = refilter_existing_snps(reference_sequence_bases, number_of_snps, snp_locations, filtered_snp_locations,internal_nodes);
	// The entries of filtered_bases_for_snps are filled in by the helper below.
	char * filtered_bases_for_snps[number_of_filtered_snps];

	filter_sequence_bases_and_rotate(reference_sequence_bases, filtered_bases_for_snps, number_of_filtered_snps);
	
	create_phylip_of_snp_sites(tree_filename, number_of_filtered_snps, filtered_bases_for_snps, sample_names, number_of_samples,internal_nodes);
	create_vcf_file(tree_filename, filtered_snp_locations, number_of_filtered_snps, filtered_bases_for_snps, sample_names, number_of_samples,internal_nodes,0);
	create_fasta_of_snp_sites(tree_filename, number_of_filtered_snps, filtered_bases_for_snps, sample_names, number_of_samples,internal_nodes);
	
	// Create a new tree with updated distances
	scale_branch_distances(root_node, number_of_filtered_snps);

	FILE *output_tree_pointer;
	output_tree_pointer=fopen(tree_filename, "w");
	if(output_tree_pointer == NULL)
	{
		// Skip writing but still fall through to the cleanup below.
		perror(tree_filename);
	}
	else
	{
		print_tree(root_node,output_tree_pointer);
		fprintf(output_tree_pointer,";");
		fflush(output_tree_pointer);
		if(fclose(output_tree_pointer) != 0)
		{
			perror(tree_filename);
		}
	}
	
	
	// NOTE(review): the original author reported a segfault in this cleanup
	// section.  The entries of sample_names and filtered_bases_for_snps are
	// set by helpers not visible here; confirm that each one is an
	// individually heap-allocated pointer before relying on these frees.
	for(i = 0; i < number_of_columns; i++)
	{
		free(column_names[i] );
	}
	
	for(i=0; i<number_of_samples; i++ )
	{
		free(sample_names[i]);
	}
	
	for(i=0; i<number_of_filtered_snps; i++ )
	{
		free(filtered_bases_for_snps[i]);
	}
	cleanup_node_memory(root_node);
	seqFreeAll();
	free(reference_sequence_bases);
}