Exemplo n.º 1
0
/*
 * Detect the SNP sites of the alignment in `filename` and write them out in
 * every requested format.
 *
 * filename                 input path, passed to detect_snps() and
 *                          get_bases_for_each_snp().
 * output_multi_fasta_file  non-zero to write the multi-FASTA output.
 * output_vcf_file          non-zero to write the VCF output.
 * output_phylip_file       non-zero to write the phylip output.
 * output_filename          optional output name; when NULL or empty the
 *                          input filename with its directory stripped is
 *                          used as the base name instead.
 * output_reference         forwarded to the phylip/FASTA writers.
 * pure_mode,
 * output_monomorphic       forwarded to detect_snps().
 *
 * If none of the three format flags is set, the multi-FASTA output is
 * written. A format-specific extension is appended to the base name when more
 * than one format is requested or when no explicit output name was given.
 * Names that do not fit in FILENAME_MAX bytes are truncated rather than
 * overflowing the buffer.
 *
 * Returns 1 on success, 0 if a working buffer could not be allocated.
 */
static int generate_snp_sites_generic(char filename[],
                                      int output_multi_fasta_file,
                                      int output_vcf_file,
                                      int output_phylip_file,
                                      char output_filename[],
                                      int output_reference, int pure_mode, int output_monomorphic)
{
	int i;
	int result = 0;
	int has_output_filename = (output_filename != NULL && *output_filename != '\0');
	int append_extension = (output_vcf_file + output_phylip_file + output_multi_fasta_file) > 1 || !has_output_filename;
	char output_filename_base[FILENAME_MAX];
	char filename_without_directory[FILENAME_MAX];

	detect_snps(filename, pure_mode, output_monomorphic);

	/* bases_for_snps is not declared in this function; it is assumed to be a
	 * file-scope pointer -- TODO confirm. One zero-filled row per SNP. */
	bases_for_snps = calloc(get_number_of_snps()+1, sizeof(char*));
	if(bases_for_snps == NULL)
	{
		fprintf(stderr, "Unable to allocate memory for the SNP bases\n");
		goto cleanup;
	}

	for(i = 0; i < get_number_of_snps(); i++)
	{
		bases_for_snps[i] = calloc(get_number_of_samples()+1, sizeof(char));
		if(bases_for_snps[i] == NULL)
		{
			fprintf(stderr, "Unable to allocate memory for the SNP bases\n");
			goto cleanup;
		}
	}

	get_bases_for_each_snp(filename, bases_for_snps);

	/* snprintf always NUL-terminates, unlike strncpy at the buffer limit. */
	strip_directory_from_filename(filename, filename_without_directory);
	snprintf(output_filename_base, sizeof output_filename_base, "%s",
	         has_output_filename ? output_filename : filename_without_directory);

	if(output_vcf_file)
	{
		char vcf_output_filename[FILENAME_MAX];
		snprintf(vcf_output_filename, sizeof vcf_output_filename, "%s%s",
		         output_filename_base, append_extension ? ".vcf" : "");

		create_vcf_file(vcf_output_filename, get_snp_locations(), get_number_of_snps(), bases_for_snps, get_sequence_names(), get_number_of_samples(), get_length_of_genome(), get_pseudo_reference_sequence());
	}

	if(output_phylip_file)
	{
		char phylip_output_filename[FILENAME_MAX];
		snprintf(phylip_output_filename, sizeof phylip_output_filename, "%s%s",
		         output_filename_base, append_extension ? ".phylip" : "");

		create_phylib_of_snp_sites(phylip_output_filename, get_number_of_snps(), bases_for_snps, get_sequence_names(), get_number_of_samples(), output_reference, get_pseudo_reference_sequence(),get_snp_locations());
	}

	/* Multi-FASTA is also the default when no format was requested. */
	if(output_multi_fasta_file || (!output_vcf_file && !output_phylip_file))
	{
		char multi_fasta_output_filename[FILENAME_MAX];
		snprintf(multi_fasta_output_filename, sizeof multi_fasta_output_filename, "%s%s",
		         output_filename_base, append_extension ? ".snp_sites.aln" : "");

		create_fasta_of_snp_sites(multi_fasta_output_filename, get_number_of_snps(), bases_for_snps, get_sequence_names(), get_number_of_samples(), output_reference, get_pseudo_reference_sequence(),get_snp_locations());
	}

	result = 1;

cleanup:
	/* free memory */
	free(get_snp_locations());
	/* NOTE(review): the strings returned by get_sequence_names() are not
	 * released here; the original code had that free commented out. */
	if(bases_for_snps != NULL)
	{
		/* Rows that were never allocated are NULL (calloc), so free() is a no-op. */
		for(i = 0; i < get_number_of_snps(); i++)
		{
			free(bases_for_snps[i]);
		}
		free(bases_for_snps);
		bases_for_snps = NULL;
	}
	free(get_pseudo_reference_sequence());

	return result;
}
Exemplo n.º 2
0
/*
 * Find the SNP sites of the alignment in `filename` and write them out in
 * every requested format.
 *
 * filename                 input path, passed to the genome/SNP helpers.
 * output_multi_fasta_file  non-zero to write the multi-FASTA output.
 * output_vcf_file          non-zero to write the VCF output.
 * output_phylip_file       non-zero to write the phylip output.
 * output_filename          optional output name; when NULL or empty the
 *                          input filename with its directory stripped is
 *                          used as the base name instead.
 *
 * If none of the three format flags is set, the multi-FASTA output is
 * written. A format-specific extension is appended to the base name when more
 * than one format is requested or when no explicit output name was given.
 * Names that do not fit in FILENAME_MAX bytes are truncated rather than
 * overflowing the buffer.
 *
 * All working arrays live on the heap, so SNP/sample counts of zero or of a
 * size that would not fit on the stack are handled.
 *
 * Returns 1 on success, 0 if a helper reports a negative count or a buffer
 * cannot be allocated.
 */
int generate_snp_sites(char filename[],int output_multi_fasta_file, int output_vcf_file, int output_phylip_file, char output_filename[])
{
	size_t length_of_genome;
	char * reference_sequence = NULL;
	int number_of_snps = 0;
	int * snp_locations = NULL;
	int number_of_samples = 0;
	char ** sequence_names = NULL;
	char ** bases_for_snps = NULL;
	int has_output_filename = (output_filename != NULL && *output_filename != '\0');
	int append_extension = (output_vcf_file + output_phylip_file + output_multi_fasta_file) > 1 || !has_output_filename;
	int result = 0;
	int i;
	char output_filename_base[FILENAME_MAX];
	char filename_without_directory[FILENAME_MAX];

	length_of_genome = genome_length(filename);
	reference_sequence = calloc(length_of_genome + 1, sizeof(char));
	if(reference_sequence == NULL)
	{
		goto out_of_memory;
	}

	build_reference_sequence(reference_sequence,filename);
	number_of_snps = detect_snps(reference_sequence, filename, length_of_genome);
	if(number_of_snps < 0)
	{
		fprintf(stderr, "Invalid number of SNPs detected in %s\n", filename);
		goto cleanup;
	}

	snp_locations = calloc((size_t) number_of_snps + 1, sizeof(int));
	if(snp_locations == NULL)
	{
		goto out_of_memory;
	}
	build_snp_locations(snp_locations, reference_sequence);
	free(reference_sequence);
	reference_sequence = NULL;

	number_of_samples = number_of_sequences_in_file(filename);
	if(number_of_samples < 0)
	{
		fprintf(stderr, "Invalid number of sequences found in %s\n", filename);
		goto cleanup;
	}

	/* Find out the names of the sequences. The extra zeroed slot keeps the
	 * allocation non-empty even when there are no samples. */
	sequence_names = calloc((size_t) number_of_samples + 1, sizeof(char *));
	if(sequence_names == NULL)
	{
		goto out_of_memory;
	}
	for(i = 0; i < number_of_samples; i++)
	{
		sequence_names[i] = calloc(MAX_SAMPLE_NAME_SIZE,sizeof(char));
		if(sequence_names[i] == NULL)
		{
			goto out_of_memory;
		}
	}

	get_sample_names_for_header(filename, sequence_names, number_of_samples);

	/* One zero-filled row per SNP. */
	bases_for_snps = calloc((size_t) number_of_snps + 1, sizeof(char *));
	if(bases_for_snps == NULL)
	{
		goto out_of_memory;
	}
	for(i = 0; i < number_of_snps; i++)
	{
		bases_for_snps[i] = calloc((size_t) number_of_samples + 1, sizeof(char));
		if(bases_for_snps[i] == NULL)
		{
			goto out_of_memory;
		}
	}

	get_bases_for_each_snp(filename, snp_locations, bases_for_snps, length_of_genome, number_of_snps);

	/* snprintf always NUL-terminates, unlike strncpy at the buffer limit. */
	strip_directory_from_filename(filename, filename_without_directory);
	snprintf(output_filename_base, sizeof output_filename_base, "%s",
	         has_output_filename ? output_filename : filename_without_directory);

	if(output_vcf_file)
	{
		char vcf_output_filename[FILENAME_MAX];
		snprintf(vcf_output_filename, sizeof vcf_output_filename, "%s%s",
		         output_filename_base, append_extension ? ".vcf" : "");

		create_vcf_file(vcf_output_filename, snp_locations, number_of_snps, bases_for_snps, sequence_names, number_of_samples, length_of_genome);
	}

	if(output_phylip_file)
	{
		char phylip_output_filename[FILENAME_MAX];
		snprintf(phylip_output_filename, sizeof phylip_output_filename, "%s%s",
		         output_filename_base, append_extension ? ".phylip" : "");

		create_phylib_of_snp_sites(phylip_output_filename, number_of_snps, bases_for_snps, sequence_names, number_of_samples);
	}

	/* Multi-FASTA is also the default when no format was requested. */
	if(output_multi_fasta_file || (!output_vcf_file && !output_phylip_file))
	{
		char multi_fasta_output_filename[FILENAME_MAX];
		snprintf(multi_fasta_output_filename, sizeof multi_fasta_output_filename, "%s%s",
		         output_filename_base, append_extension ? ".snp_sites.aln" : "");

		create_fasta_of_snp_sites(multi_fasta_output_filename, number_of_snps, bases_for_snps, sequence_names, number_of_samples);
	}

	result = 1;
	goto cleanup;

out_of_memory:
	fprintf(stderr, "Out of memory while generating SNP sites for %s\n", filename);

cleanup:
	/* free memory; rows that were never allocated are NULL, so free() is a no-op */
	free(reference_sequence);
	free(snp_locations);
	if(sequence_names != NULL)
	{
		for(i = 0; i < number_of_samples; i++)
		{
			free(sequence_names[i]);
		}
		free(sequence_names);
	}
	if(bases_for_snps != NULL)
	{
		for(i = 0; i < number_of_snps; i++)
		{
			free(bases_for_snps[i]);
		}
		free(bases_for_snps);
	}

	return result;
}
Exemplo n.º 3
0
int main_searchvariants(int argc, char* argv[],char *server_url)
{
	int cmd;
	int i;
	search_variant_request *request=(search_variant_request*)malloc(1*sizeof(search_variant_request));	
	int size_variants = 0;
	int size_calls = 0;
	request->name = "null";
	request->pageToken ="null";
	char debug = 0;
	
	static struct option long_options[]={
		{"variantSetIds",required_argument,0,'v'},
		{"referenceName",required_argument,0,'r'},
		{"start",required_argument,0,'s'},
		{"end",required_argument,0,'e'},
		{"callSetIds",required_argument,0,'c'},
		{"variantName",required_argument,0,'n'},
		{"debug",no_argument,0,'d'},
		{0,0,0,0}
	};
	//if(argc < 2)
	while((cmd=getopt_long(argc,argv,"v:r:s:e:c:n:d",long_options,NULL))!=-1)
	{
		switch(cmd)
		{
			case 'v':
					if(optarg==NULL||(strcmp(optarg,"")==0))
						{
							error("--variantSetIds string can't be empty.\n");	
						}
					  else
						{
							size_variants = count_ids(optarg);
						 	request->variantSetIds = (char**)malloc(size_variants*sizeof(char*));
							set_ids(optarg,request->variantSetIds,size_variants);
						}
						break;
			case 'r':
					if(optarg==NULL||(strcmp(optarg,"")==0))
						{
							error("--referenceName string can't be empty.\n");				
						}
					  else
						{
							request->referenceName = optarg; 
						  	
						}
						break;
			case 's': request->start = atol(optarg);
					if(request->start < 0)
					{
					  	error("--start integer must be no negative.");
					}
					break;
			case 'e': request->end = atol(optarg); 
					if(request->end < 0)
					{
						perror("--end integer must be no negative.");
					}
					break;
			case 'c': if(optarg==NULL||(strcmp(optarg,"")==0))
						{
							size_calls = 0;
						}
					  else
					  	{
					  		size_calls = count_ids(optarg);
						  	request->callSetIds = (char**)malloc(size_calls*sizeof(char*));
						  	set_ids(optarg,request->callSetIds,size_calls);
					  	}
					break;
			case 'n': request->name = optarg; break;
			case 'd': debug = 1; break;
			case '?': usage();
			default: error("Unknown argument %s\n",optarg);
		}
	}
	
	if(size_variants==0)
	{
		usage();
	}
	
	start_user(server_url);
	char* vcf_file_name;
	//process each variantSet
	for(i=0; i<size_variants; i++)
	{
		
		vcf_file_name = get_variantSetId_vcf_name(request,i);
		create_vcf_file(vcf_file_name);	
		
		while(strcmp(request->pageToken,"NULL")!=0)
		{
			user->post_fields = create_request_string(request,i,size_calls);
			//printf("post field string: %s \n",user->post_fields);
			client_search_request(user,"variants");
			//printf("%s\n",user->response);
			write_vcf_file(user->response,vcf_file_name);
			request->pageToken = get_pageToken();
			//printf("%s \n",request->pageToken);
			if(debug)
			{
				printf("%s\n",user->response);
			}
		}
	}
	end_user();
	return 0;
}
Exemplo n.º 4
0
/*
 * Rebuild the SNP outputs and the tree for an existing analysis.
 *
 * Reads the column names and the "POS" column from the VCF file, builds the
 * newick tree from `tree_filename`, re-filters the SNPs against the first
 * sample's sequence, writes phylip/VCF/FASTA outputs using `tree_filename`
 * as the name passed to the writers, rescales the branch distances and
 * finally overwrites `tree_filename` with the printed tree followed by ';'.
 *
 * vcf_filename                    VCF file opened for reading.
 * tree_filename                   tree file; read by build_newick_tree() and
 *                                 rewritten at the end.
 * multi_fasta_filename            not used by this function.
 * min_snps                        forwarded to build_newick_tree().
 * original_multi_fasta_filename   passed to genome_length().
 *
 * The SNP and sample counts come from the phylip parser helpers
 * (number_of_snps_in_phylip(), number_of_samples_from_parse_phylip()), which
 * read state set up outside this function.
 *
 * On failure (a file cannot be opened, a helper reports an unusable count, or
 * memory runs out) a message is printed to stderr and the function returns
 * after releasing what it acquired. seqFreeAll() is called on every path,
 * as it was at the end of the original function.
 */
void extract_sequences(char vcf_filename[], char tree_filename[],char multi_fasta_filename[],int min_snps, char original_multi_fasta_filename[])
{
	FILE *vcf_file_pointer = NULL;
	FILE *output_tree_pointer = NULL;
	newick_node* root_node = NULL;
	int tree_built = 0;
	int number_of_snps = 0;
	int number_of_columns = 0;
	int number_of_samples = 0;
	int number_of_filtered_snps = 0;
	int length_of_original_genome;
	int i;
	/* Heap arrays instead of VLAs: the counts come from input files, so they
	 * may be zero or too large for the stack. Pointer arrays are zero-filled
	 * so that freeing an entry a helper never set is a no-op. */
	char **column_names = NULL;
	char **sample_names = NULL;
	char **filtered_bases_for_snps = NULL;
	int *snp_locations = NULL;
	int *filtered_snp_locations = NULL;
	int *internal_nodes = NULL;
	char *reference_sequence_bases = NULL;

	vcf_file_pointer = fopen(vcf_filename, "r");
	if(vcf_file_pointer == NULL)
	{
		fprintf(stderr, "Unable to open VCF file %s for reading\n", vcf_filename);
		goto cleanup;
	}

	length_of_original_genome = genome_length(original_multi_fasta_filename);

	number_of_columns = get_number_of_columns_from_file(vcf_file_pointer);
	if(number_of_columns < 0)
	{
		fprintf(stderr, "Invalid number of columns in VCF file %s\n", vcf_filename);
		goto cleanup;
	}
	column_names = calloc((size_t) number_of_columns + 1, sizeof(char *));
	if(column_names == NULL)
	{
		goto out_of_memory;
	}
	for(i = 0; i < number_of_columns; i++)
	{
		column_names[i] = calloc(MAX_SAMPLE_NAME_SIZE,sizeof(char));
		if(column_names[i] == NULL)
		{
			goto out_of_memory;
		}
	}
	get_column_names(vcf_file_pointer, column_names, number_of_columns);

	number_of_snps  = number_of_snps_in_phylip();
	if(number_of_snps < 0)
	{
		fprintf(stderr, "Invalid number of SNPs reported by the phylip parser\n");
		goto cleanup;
	}
	snp_locations = calloc((size_t) number_of_snps + 1, sizeof(int));
	filtered_snp_locations = calloc((size_t) number_of_snps + 1, sizeof(int));
	reference_sequence_bases = calloc((size_t) number_of_snps + 1, sizeof(char));
	if(snp_locations == NULL || filtered_snp_locations == NULL || reference_sequence_bases == NULL)
	{
		goto out_of_memory;
	}

	get_integers_from_column_in_vcf(vcf_file_pointer, snp_locations, number_of_snps, column_number_for_column_name(column_names, "POS", number_of_columns));

	root_node = build_newick_tree(tree_filename, vcf_file_pointer,snp_locations, number_of_snps, column_names, number_of_columns, length_of_original_genome,min_snps);
	tree_built = 1;
	fclose(vcf_file_pointer);
	vcf_file_pointer = NULL;

	number_of_samples = number_of_samples_from_parse_phylip();
	if(number_of_samples < 1)
	{
		/* sample_names[0] is needed below as the reference sample. */
		fprintf(stderr, "No samples reported by the phylip parser\n");
		goto cleanup;
	}
	sample_names = calloc((size_t) number_of_samples + 1, sizeof(char *));
	internal_nodes = calloc((size_t) number_of_samples + 1, sizeof(int));
	if(sample_names == NULL || internal_nodes == NULL)
	{
		goto out_of_memory;
	}
	get_sample_names_from_parse_phylip(sample_names);

	get_sequence_for_sample_name(reference_sequence_bases, sample_names[0]);
	for(i = 0; i < number_of_samples; i++)
	{
		internal_nodes[i] = get_internal_node(i);
	}

	number_of_filtered_snps = refilter_existing_snps(reference_sequence_bases, number_of_snps, snp_locations, filtered_snp_locations,internal_nodes);
	if(number_of_filtered_snps < 0)
	{
		fprintf(stderr, "Invalid number of filtered SNPs\n");
		goto cleanup;
	}
	filtered_bases_for_snps = calloc((size_t) number_of_filtered_snps + 1, sizeof(char *));
	if(filtered_bases_for_snps == NULL)
	{
		goto out_of_memory;
	}

	filter_sequence_bases_and_rotate(reference_sequence_bases, filtered_bases_for_snps, number_of_filtered_snps);

	create_phylip_of_snp_sites(tree_filename, number_of_filtered_snps, filtered_bases_for_snps, sample_names, number_of_samples,internal_nodes);
	create_vcf_file(tree_filename, filtered_snp_locations, number_of_filtered_snps, filtered_bases_for_snps, sample_names, number_of_samples,internal_nodes,0);
	create_fasta_of_snp_sites(tree_filename, number_of_filtered_snps, filtered_bases_for_snps, sample_names, number_of_samples,internal_nodes);

	/* Create a new tree with updated distances */
	scale_branch_distances(root_node, number_of_filtered_snps);

	output_tree_pointer=fopen(tree_filename, "w");
	if(output_tree_pointer == NULL)
	{
		fprintf(stderr, "Unable to open tree file %s for writing\n", tree_filename);
		goto cleanup;
	}
	print_tree(root_node,output_tree_pointer);
	fprintf(output_tree_pointer,";");
	fflush(output_tree_pointer);
	fclose(output_tree_pointer);
	goto cleanup;

out_of_memory:
	fprintf(stderr, "Out of memory while extracting sequences\n");

cleanup:
	if(vcf_file_pointer != NULL)
	{
		fclose(vcf_file_pointer);
	}

	/* NOTE(review): the original carried the remark "Theres a seg fault in
	 * here". The pointer arrays used to be uninitialised stack arrays, so any
	 * entry a helper left unset was freed as garbage; they are zero-filled
	 * now. This assumes the helpers store heap pointers in the entries they
	 * do set -- confirm against get_sample_names_from_parse_phylip() and
	 * filter_sequence_bases_and_rotate(). */
	if(column_names != NULL)
	{
		for(i = 0; i < number_of_columns; i++)
		{
			free(column_names[i]);
		}
		free(column_names);
	}

	if(sample_names != NULL)
	{
		for(i = 0; i < number_of_samples; i++)
		{
			free(sample_names[i]);
		}
		free(sample_names);
	}

	if(filtered_bases_for_snps != NULL)
	{
		for(i = 0; i < number_of_filtered_snps; i++)
		{
			free(filtered_bases_for_snps[i]);
		}
		free(filtered_bases_for_snps);
	}

	free(snp_locations);
	free(filtered_snp_locations);
	free(internal_nodes);

	/* Only hand the tree to its cleanup routine once it has been built. */
	if(tree_built)
	{
		cleanup_node_memory(root_node);
	}
	seqFreeAll();
	free(reference_sequence_bases);
}