InStruct.c

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>

#include <omp.h>
#include <unistd.h>
#include <sys/types.h>
#include <limits.h>

#include "nrutil.h"
#include "random.h"
#include "data_interface.h"
#include "initial.h"
#include "check_converg.h"
#include "mcmc.h"
#include "result_analysis.h"
#include "quantile.h"
#include "poly_geno.h"


#define MAXLINE 10000
#define MAXLEN 100

static void param_decomp(int, char **);
static void printinfo(char *, int, char **,SEQDATA);
static void inf_K_val(char *outfilename, int n_small, int n_large, SEQDATA *data,INIT initial);
static double mem_cal(SEQDATA data,INIT initial);

double siglevel=0.900;
int nloci=100;
int popnum=2;
int totalsize=100;
int ploid=2;
long updatenum=1000000;
long burnin=500000;
int thinning=10;
int ckrep=20;
int GR_flag=1;
int chainnum=2;
char *missingdata="-9";
char *datafilename=NULL;
char *outfilename=NULL;
char *initialfilename=NULL;
char *convgfilename=NULL;
int label=1;
int popdata=1;
int prior_flag=0;	//which prior for selfing rates, uniform (0) or DPM (1) Dirichlet Process prior			
double alpha_dpm=10;
int back_refl=1;
int type_freq=1;	//indicate which way to calculate genotype frequency,expectation way or structure way
int nstep_check_empty_cluster=20;
int n_extra_col=0;
int markername_flag=0;
int print_iter=1; //print the updated information of each iteration, 1=yes, 0=no
int print_freq=0; //print the allele frequencies to output file, 1=yes,0=no
int n_small=1;
int n_large=0;
int inf_K=0;	//infer the number of subpopulations (1) or not (0)
int distr_fmt=1; //output follows Distruct format (1) or not (0)
int autopoly=1; //auto-tetraploid=1; allo-tetraploid=0
int data_fmt=0; //data_fmt=0 means one haploid per line;data_fmt=1 means one individual per line; 
double max_mem=1.0e9;
int mode=1;	//indicate which mode is used: 
			//infer	population substructure without admixture only (0)
			//infer population substructure only (1);
			//infer substructure and selfing rates for populations (2);
			//infer substructure and selfing rates for individuals (3);
			//infer substructure and inbreeding coefficients for populations (4);
			//infer substructure and inbreeding coefficients for individuals (5);
			
			






/* 
 * This application estimates the selfing rates for subpopulations and
 * classfies each individual into subpopulations given the sequence data.
 *
 * Synopsis:
 *		InStruct -d data_file -o output_file [-i initial_file] 
 *		[-K population number] [-L loci number] [-N total individual number]
 *		[-p ploid] [-u iteration number] [-b burn-in number] [-m missingdata]
 *		[-t thinning] [-c chain number] [-s seed1 seed2 seed3] [-sl significance level]
 *		[-lb label] [-a popdata] [-g GR_flag] [-r ckrep] [-f prior_flag] [-v mode]
 *		[-h alpha_dpm] [-e back_refl] [-y type_freq] [-j nstep_check_empty_cluster]
 *		[-x extra_columns] [-w markername] [-cf convgfilename]  [-pi print_iter]
 *		[-pf print_freq] [-ik inf_K] [-kv n_small n_large] [-df distr_fmt] [-ap autopoly]
 *		[-af data_fmt] [-mm max_mem]
 *
 * Parameters:
 * -d data_file - name of data file
 * -o output_file - name of output file
 * -i initial_file - name of initial file
 * -K popnum - subpopulation number
 * -L locinum - totoal loci number
 * -N totalsize - total individual number
 * -p ploid -  the number of haplotype in a genome
 * -u update - MCMC iteration number
 * -b burnin - MCMC burn-in number
 * -t thinning - MCMC thinning interval length
 * -c chainnum - the number of MCMC chains
 * -s seed1 seed2 seed3 three integers to override the default seed selection 
 * -m an integer represents missing data 
 * -sl significant level for the confidence intervals in result_analysis.c
 * -lb label boolean indicates whether data_file contains labels
 *     for individuals, 1=yes, 0=no
 * -a popdata boolean indicates whether data_file contains a column 
 *    about the original population information, 1=yes, 0=no
 * -g GR_flag boolean indicates whether Gelman_Rudin statistic 
 *    is used to check convergence,1=yes, 0=no
 * -r ckrep integer indicates how many stored iterations after 
 *    burn-in are used in convergence checking
 * -f prior_flag boolean indicates which prior for selfing rates, 
 *    uniform (0) or DPM (1) Dirichlet Process prior
 * -y boolean indicate which way to calculate genotype frequency,
 *    expectation way (0) or structure way (1)
 * -e boolean indicate which proposal method for selfing rates, 
 *    adaptive independence sampler(0) or back-reflection (1)
 * -h alpha_dpm  the scaling parameter "alpha" in Dirichlet Process Mixture model
 * -j nstep_check_empty_cluster the number of iterations after burn-in 
 *    that will be used to determine the existence of empty clusters
 * -x extra_columns integer indicates the number of extra columns 
 *    in data file besides label and popdata coloumns
 * -w markernames boolean indicates existence of marker name 
 *    line at the beginning of data file:
 *    no marker name line (0) or marker name line exist (1)
 * -pi indicates whether to print the updated information of each iteration,
 *     etc. log-Likelihood. 1=yes, 0=no
 * -cf convgfilename -name of the file storing updated values
 *     for convergence assessment
 * -pf print_freq indicates whether to print the result of allele 
 *     frequencies to output file, 1=yes, 0=no
 * -ik inf_K indicates whether inferring the number of subpopulations or not
 * -kv  n_small n_large indicates the lower and upper boundary for value of K
 * -df distr_fmt indicates whether to use the Distruct format for output (1) or not (0)
 * -ap autopoly indicates whether the species is autopolyploid (1) or allopolyploid (0)
 * -af data_fmt=0 means one haploid per line;data_fmt=1 means one individual per line;
 * -mm
 * -v mode integer indicate which mode is used: 
 *	  infer	population substructure without admixture only (0);
 *    the rest options infer population structure with admixture:
 *	  infer population substructure only (1);
 *	  infer substructure and selfing rates for populations (2);
 *	  infer substructure and selfing rates for individuals (3);
 *	  infer substructure and inbreeding coefficients for populations (4);
 *	  infer substructure and inbreeding coefficients for individuals (5);
 *
 * Example:
 * ./InStruct -d example.str -o exampleout.txt -N 50 -L 50 -lb 1 -a 1 -w 1 -mm 3.0e9 -v 0 -ik 1 -kv 1 7 -c 1 -u 10000 -b 1250 -t 25 -r 250 -cf exampleconv.txt
 * ./InStruct -d example.str -o exampleout.txt -N 50 -L 50 -lb 1 -a 1 -w 1 -mm 3.0e9 -v 0 -ik 1 -kv 1 7 -c 1 -u 10000 -b 1250 -t 25 -g 0
*/
 
 
 
 
int main(int argc, char **argv)
{
	SEQDATA seqdata;
	INIT initial;
	CHAIN chain;
	CONVG cvg;	
 	int chn;
	
	param_decomp(argc,argv);
		
	seqdata=read_data(datafilename,ploid,totalsize,popnum,nloci,missingdata,label,popdata,siglevel,back_refl,type_freq,nstep_check_empty_cluster,prior_flag,mode,n_extra_col,markername_flag,alpha_dpm,print_iter,print_freq,inf_K,distr_fmt,autopoly,data_fmt,max_mem);

	if(inf_K==0)
	{	
		initial=read_init(initialfilename,chainnum,popnum,updatenum,burnin,thinning);
	}
	else{
		initial=read_init(initialfilename,chainnum,n_large,updatenum,burnin,thinning);
	}
	if(mem_cal(seqdata,initial)>seqdata.max_mem)
	{	nrerror("Your request of memory exceeds the maximum memory allowed! Please change the parameter max_mem");}
	
	printinfo(outfilename,argc,argv,seqdata);
	
	if(inf_K==1)
	{		// make inference of K
		inf_K_val(outfilename,n_small,n_large,&seqdata,initial);
	}
	else{
		if(GR_flag==1) allocate_convg(seqdata,&cvg,chainnum,ckrep,convgfilename);	
		for(chn=0;chn<chainnum;chn++)					
		{	
			chain=mcmc_updating(seqdata,initial,chn,&cvg);
			if(chain.flag_empty_cluster==1)
			{				
				free_chain(&chain,seqdata);
				chn--;
				continue;
			}
			chain_stat(outfilename,chain,seqdata,chn);
			free_chain(&chain,seqdata);
		}
		if(GR_flag==1) 
		{	
			chain_converg(outfilename,&cvg);
			free_convg(&cvg);
		}		
	}
	fprintf(stdout,"THE JOB IS SUCCESSFULLY FINISHED\n");
	return(0); 
}      

double mem_cal(SEQDATA data,INIT initial)
{
	double temp=0;
	int K=0;
	if(inf_K==0)	{	K=data.popnum;}
	else{	if(inf_K==1)	{	K=n_large;}}
	if(data.print_freq==1)	temp+=(double)(8*K*data.locinum*data.allelenum_max);
	temp+=(double)(8+8*data.totalsize);
	switch(data.mode)	
	{
		case 2: temp+=(double)(K*8);break;
		case 3: temp+=(double)(data.totalsize*8);break;
		case 4: temp+=(double)(K*8);break;
		case 5: temp+=(double)(data.totalsize*8); break;
	}
	temp+=(double)(data.totalsize*4);
	temp+=(double)(8*data.totalsize*K);
	temp*=(double)((initial.update-initial.burnin)/initial.thinning);
	fprintf(stdout,"The memory required for this run is %f \n",temp);
	fprintf(stdout,"The maximum memory allowed is %f \n",data.max_mem); 
	return(temp);
}
  

void param_decomp(int argc, char ** argv)
/*
 * Function param_decomp decomposites the commandline arguments
 */
{
	int i,j;
	long *seeds,seednum=3;
	char *msg;
	
	seeds=lvector(0,seednum-1);
	
	msg="Synopsis:\n\tInStruct -d data_file -o output_file [-i initial_file] [-K population number] [-L loci number] [-N total individual number] [-p ploid] [-u iteration number] [-b burn-in number] [-m missingdata] [-t thinning] [-c chain number] [-s seed1 seed2 seed3] [-sl significance level] [-lb label] [-a popdata] [-g GR_flag] [-r ckrep] [-f prior_flag] [-v mode] [-h alpha_dpm] [-e back_refl] [-y type_freq] [-j nstep_check_empty_cluster] [-x extra_columns] [-w markername] [-cf convgfilename] [-pi print_iter] [-pf print_freq]  [-ik inf_K] [-kv n_small n_large] [-df distr_fmt] [-ap autopoly] [-af data_fmt] [-mm max_mem]\n";
	if(argc==2&&strcmp(argv[1],"-h")==0)		/*print help message*/
	{
		fprintf(stdout,"%s",msg);
		exit(1);
	}
	else{
		if(argc<5)								/* partition the commandline arguments*/
		{	nrerror("Too few arguments in the command line!");}
		else{
			for(i=1;i<argc;i++)
			{
				if(strcmp(argv[i],"-d")==0)
				{								/*-d means to assign data file name*/
					datafilename=argv[i+1];        			
					continue;
				}
				if(strcmp(argv[i],"-o")==0)
				{								/*-o means to assign output file name*/
					outfilename=argv[i+1];        			
					continue;
				}
				if(strcmp(argv[i],"-i")==0)
				{								/*-i means to assign initial file name*/
					initialfilename=argv[i+1];        			
					continue;
				}
				if(strcmp(argv[i],"-cf")==0)
				{								/*-cf means to assign initial file name*/
					convgfilename=argv[i+1];        			
					continue;
				}
				if(strcmp(argv[i],"-L")==0)
				{								/*-L means to reassign the loci number a new value*/
					nloci=atoi(argv[i+1]);        			
					continue;
				}        		
				if(strcmp(argv[i],"-N")==0)
				{								/*-N means to reset the number of total individuals*/
					totalsize=atoi(argv[i+1]);     			
					continue;
				}        		
				if(strcmp(argv[i],"-K")==0)
				{								/*-K means to reassign population number a new value*/
					popnum=atoi(argv[i+1]);    			
					continue;
				}
				if(strcmp(argv[i],"-p")==0)
				{								/*-p means to reset the number of haplotype in a genome*/
					ploid=atoi(argv[i+1]);    	/*for diploid, ploid=2*/		
					continue;
				}
				if(strcmp(argv[i],"-u")==0)
				{								/*-u means to reset the number of update steps of MCMC*/
					updatenum=atoi(argv[i+1]);    			
					continue;
				}
				if(strcmp(argv[i],"-b")==0)
				{								/*-b means to reassign burnin number a new value*/
					burnin=atoi(argv[i+1]); 
					if(burnin==0)	
					{	nrerror("Burn-in should not be zero!");}   			
					continue;
				}
				if(strcmp(argv[i],"-t")==0)
				{								/*-t means to reassign thinning number a new value  */
					thinning=atoi(argv[i+1]);   /*thinning is to take iterations at an even interval*/ 			
					continue;					/*which can reduces the autocorrelation between iterations*/											
				}								/*and thinning can also reduces the memory needed*/
				if(strcmp(argv[i],"-c")==0)
				{								/*-c means to reassign thinning number a new value*/
					chainnum=atoi(argv[i+1]);    			
					continue;
				}
				if(strcmp(argv[i],"-m")==0)
				{								/*-m means to reset the number that represents missing data*/
					missingdata=argv[i+1];    			
					continue;
				}
				if(strcmp(argv[i],"-lb")==0)
				{								/*-lb indicates whether data_file contains labels for individuals, 1=yes, 0=no*/
					label=atoi(argv[i+1]);    			
					continue;
				}
				if(strcmp(argv[i],"-a")==0)
				{								/*-a indicates whether data_file contains a column about the original population information, 1=yes, 0=no*/
					popdata=atoi(argv[i+1]);    			
					continue;
				}
				if(strcmp(argv[i],"-g")==0)
				{								/*-g indicates whether the  Gelman_Rudin statistic is used to check convergence,1=yes, 0=no*/
					GR_flag=atoi(argv[i+1]);    			
					continue;
				}
				if(strcmp(argv[i],"-f")==0)
				{								/*-f indicates which prior is used for selfing rates, 0=uniform,1=normal,2=DPM*/
					prior_flag=atoi(argv[i+1]);    			
					continue;
				}
				if(strcmp(argv[i],"-v")==0)
				{								/*-v indicates whether selfing rates are wrt. pop (0) or individuals (1)*/
					mode=atoi(argv[i+1]);    			
					continue;
				}
				if(strcmp(argv[i],"-r")==0)
				{								/*-r indicates how many stored iterations after burn-in are used in convergence checking*/
					ckrep=atoi(argv[i+1]);    			
					continue;
				}
				if(strcmp(argv[i],"-e")==0)
				{								/*-e indicates which proposal method for selfing rates, adaptive independence sampler(0) or back-reflection (1)*/
					back_refl=atoi(argv[i+1]);    			
					continue;
				}
				if(strcmp(argv[i],"-y")==0)
				{								/*-y indicates which way to calculate genotype frequency, expectation way (0) or structure way (1)*/
					type_freq=atoi(argv[i+1]);    			
					continue;
				}
				if(strcmp(argv[i],"-x")==0)
				{								/*-x indicates the number of extra columns in data file*/
					n_extra_col=atoi(argv[i+1]);    			
					continue;
				}
				if(strcmp(argv[i],"-pi")==0)
				{								/*-pi indicates whether to print the information of each iteration along MCMC running*/
					print_iter=atoi(argv[i+1]);    			
					continue;
				}
				if(strcmp(argv[i],"-ap")==0)
				{								/*-ap indicates whether the species is autopolyploid (1) or allopolyploid (0) */
					autopoly=atoi(argv[i+1]);    			
					continue;
				}
				if(strcmp(argv[i],"-pf")==0)
				{								/*-pf indicates whether to print the result of allele frequencies to output file*/
					print_freq=atoi(argv[i+1]);    			
					continue;
				}
				if(strcmp(argv[i],"-w")==0)
				{								/*-w indicates existence of marker name line*/
					markername_flag=atoi(argv[i+1]);    			
					continue;
				}
				if(strcmp(argv[i],"-af")==0)
				{								/*-af indicates which format of input file is used*/
					data_fmt=atoi(argv[i+1]);    			
					continue;
				}
				if(strcmp(argv[i],"-mm")==0)
				{								/*-mm indicates maximum memory allowed*/
					max_mem=atof(argv[i+1]);    			
					continue;
				}
				if(strcmp(argv[i],"-ik")==0)
				{								/*-ik indicates whether inferring the number of subpopulations or not*/
					inf_K=atoi(argv[i+1]);    			
					continue;
				}
				if(strcmp(argv[i],"-kv")==0)
				{								/*-kv indicates the lower and upper boundary for value of K*/
					n_small=atoi(argv[i+1]); 
					n_large=atoi(argv[i+2]);   			
					continue;
				}
				if(strcmp(argv[i],"-df")==0)
				{								/*-df indicates whether to use the Distruct format for output (1) or not (0)*/
					distr_fmt=atoi(argv[i+1]);    			
					continue;
				}
				if(strcmp(argv[i],"-sl")==0)
				{								/*-sl means to reset the significance level*/
					siglevel=atof(argv[i+1]);    			
					continue;
				}
				if(strcmp(argv[i],"-h")==0)
				{								/*-h means to reset the spread alpha in Dirichlet Process Mixture model*/
					alpha_dpm=atof(argv[i+1]);    			
					continue;
				}
				if(strcmp(argv[i],"-j")==0)
				{								/*-j means to reset the number of iterations after burn-in that will be used to determine the existence of empty clusters*/
					nstep_check_empty_cluster=atoi(argv[i+1]);    			
					continue;
				}
				if(strcmp(argv[i],"-s")==0)
				{	
					for(j=0;j<seednum;j++)		/*-s means to reset seeds for the random number generator*/
					{
						seeds[j]=atoi(argv[i+j+1]);
					}
					setseeds(seeds[0],seeds[1],seeds[2]);  			
					continue;
				}
			}
		}
	}
	
	if(ckrep>((updatenum-burnin)/thinning))
	{
		nrerror("The number of iterations for convergence assessment is greater than the total number of retained iterations from MCMC.");
	}
	if(nstep_check_empty_cluster>((updatenum-burnin)/thinning))
	{
		nrerror("The number of iterations for checking the existence of empty cluster is greater than the total number of retained iterations from MCMC.");
	}
	free_lvector(seeds,0,seednum-1);
}



void printinfo(char *outfilename,int argc, char **argv,SEQDATA data)
/*
 * Print the basic information of the running condition
 */
{
	int i;
	FILE *outfile;	  
	if((outfile=fopen(outfilename,"w"))==NULL)
	{	nrerror("Cannot open output file!");}		
	
	//print to output file
	fprintf(outfile,"\n");
	for(i=0;i<MAXLEN;i++)
		fprintf(outfile,"=");
	fprintf(outfile,"\n\tInStruct by Gao, Williamson and Bustamante (2007)\n");
	fprintf(outfile,"\t\t  Code by Hong Gao\n");
	fprintf(outfile,"\t\tVersion 1.0 (May. 2007)\n");
	for(i=0;i<MAXLEN;i++)
		fprintf(outfile,"=");
	
	fprintf(outfile,"\n\n\n\nCommand line arguments:\n    ");
	for(i=0;i<argc;i++)  
		fprintf(outfile,"%s ",argv[i]);	                
	fprintf(outfile,"\n\n"); 
	fprintf(outfile,"Data File:   %s\n",datafilename);  
	if(initialfilename!=NULL) fprintf(outfile,"Initial File:   %s\n",initialfilename); 
	fprintf(outfile,"Output File:   %s\n\n",outfilename); 
	 
	fprintf(outfile,"\nRun parameters:\n");
	fprintf(outfile,"    Chain Number=%d\n",chainnum);          
	fprintf(outfile,"    MCMC Iterations Number=%ld\n",updatenum);
	fprintf(outfile,"    Burn-in=%ld\n",burnin);                                             	
	fprintf(outfile,"    Thinning=%d\n",thinning);              
	fprintf(outfile,"    Ploid=%d\n",data.ploid); 
	if(data.ploid>2) 
	{	if(data.autopoly==1)	fprintf(outfile,"Autopolyploid assumed\n");
		else{	if(data.autopoly==0)	fprintf(outfile,"Allopolyploid assumed\n");}
	}                   
	fprintf(outfile,"    Missing Data=%s\n",data.missingdata); 
	fprintf(outfile,"    Population size=%d\n",data.totalsize);
	fprintf(outfile,"    Number of loci=%d\n",data.locinum);
	fprintf(outfile,"    Population number assumed=%d\n",data.popnum);
	fprintf(outfile,"    Significance level for Posterior Credible Interval=%f\n",data.siglevel);
	fprintf(outfile,"    Mode = ");
	if(data.ploid==2)
	{
		switch(data.mode)
		{	
		case 0: fprintf(outfile,"Make inference of population structure only without admixture.\n");break;
		case 1: fprintf(outfile,"Make inference of population structure only with admixture.\n");break;
		case 2: fprintf(outfile,"Make inference of population structure and the selfing rates for subpopulations.\n");break;
		case 3: fprintf(outfile,"Make inference of population structure and the selfing rates for individuals.\n");break;
		case 4: fprintf(outfile,"Make inference of population structure and the inbreeding coefficients for subpopulations.\n");break;
		case 5: fprintf(outfile,"Make inference of population structure and the inbreeding coefficients for individuals.\n");break;
		}
	}
	else{
		if(data.ploid==4)
		{	fprintf(outfile,"Make inference of population structure and the selfing rates for subpopulations.\n");}
	}
	
	if(data.inf_K==1)
	{	fprintf(outfile,"\nMake inference of the number of subpopulations.\n");}
	if(data.mode==3||data.mode==5)
	{
		if(data.prior_flag==0)  fprintf(outfile,"The Uniform prior is used for selfing rates.\n\n");
		if(data.prior_flag==1)  fprintf(outfile,"The Dirichlet Process prior is used for selfing rates and the scaling parameter is %f.\n\n",alpha_dpm);
	}
	switch(data.back_refl)
	{
		case 0: fprintf(outfile,"The proposal method for selfing rates is adaptive independence sampler.\n");break;
		case 1: fprintf(outfile,"The proposal method for selfing rates is back-reflection.\n"); break;
	}
	if(data.print_freq==1)
	{	fprintf(outfile,"The posterior allele frequencies will also be summarized and written to output file.\n");}
	
	if(GR_flag==1) fprintf(outfile,"The %d stored iteration results after burn-in will be used to calculate the GR statistic.\n",ckrep);
	if(data.distr_fmt==1)
	{	fprintf(outfile,"The output of Q are generated in the Distruct format.\n");}
	fclose(outfile);
	
} 
   


void inf_K_val(char *outfilename, int n_small, int n_large, SEQDATA *data, INIT initial)
{


	//flag=0;
	int parallelism_enabled = 1; //0=no, not 0 = yes
	int pid = getpid();
	#pragma omp parallel if(parallelism_enabled) 
	{

	int K,chn,K_best,K_num;//flag,,temp;
	double **dic,*val_K;
	CONVG cvg;
	FILE *outfile;
	CHAIN chain;

	if(n_large<1||n_small<1||n_small>n_large)
	{
		n_small=1;
		n_large=(int)pow((double)data->totalsize,0.3)+1;
		fprintf(stdout,"The range of value for K is not correct! Change to default value (%d - %d)!\n",n_small,n_large);			
	}
	K_num=n_large-n_small+1;
	dic=dmatrix(0,K_num-1,0,initial.chainnum-1);
	val_K=dvector(0,K_num-1);

	/*//flag=0;
	int parallelism_enabled = 1; //0=no, not 0 = yes

	#pragma omp parallel if(parallelism_enabled)
	{*/
		//initialize private variables
		char foutfilename[strlen(outfilename)];
		
		#pragma omp for

		for(K=n_small;K<=n_large;K++)
		{
			//modify outfilename so each K produces its own output
			strcpy(foutfilename,outfilename); //start with original outfilename

			//int K to char pK
			int ndigits = floor(log10(K)) + 1;
			char pK[ndigits]; //char array to contain int, sized to length of K
			sprintf(pK, "%d", K); //convert integer K into a char array
			
			//int pid to char pid
			char ppid[(sizeof(int)*CHAR_BIT-1)/3 + 3];
			sprintf(ppid, "%d", pid); //convert integer pid into a char array

			char pp[2] = ".";
			strncat(foutfilename, pp, 1); //add terminal "."
			strncat(foutfilename, ppid, strlen(ppid)); //add process id
			strncat(foutfilename, pp, 1); //add "."
			strncat(foutfilename, pK, strlen(pK)); //add "K"

			int tid = omp_get_thread_num();
			printf("t%d,K%d,pK%s,pp%s,out(%s),fout(%s),pid(%d)\n",tid,K,pK,pp,outfilename,foutfilename,pid);
			//getchar();




			data->popnum=K;
			
			printf("t%d,data.popnum%d\n",tid,(*data).popnum);
			
			if((outfile=fopen(foutfilename,"a+"))==NULL)
			{	nrerror("Cannot open output file!");}
			fprintf(outfile,"\n\nThe current K is %d\n",K);


			fclose(outfile);
			if(GR_flag==1) allocate_convg(*data,&cvg,chainnum,ckrep,convgfilename);
			for(chn=0;chn<initial.chainnum;chn++)					
			{	
				chain=mcmc_updating((*data),initial,chn,&cvg);
				dic[K-n_small][chn]=chain_stat(foutfilename,chain,*data,chn);
				free_chain(&chain,(*data));
			}
			if(GR_flag==1) 
			{	
				//temp=
				chain_converg(foutfilename,&cvg);
				free_convg(&cvg);
			}
			//if(temp==1) //bad convergence or only one chain
			//{	flag=1;} 
		}
	} //end omp parallel if

	//COMBINE SEPARATE OUTPUT FILES
	printf("combining files.\n");
	char foutfilename[strlen(outfilename)];
	int K;
	for(K=n_small;K<=n_large;K++)
	{
		//modify outfilename so each K produces its own output
		strcpy(foutfilename,outfilename); //start with original outfilename

		//int K to char pK
		int ndigits = floor(log10(K)) + 1;
		char pK[ndigits]; //char array to contain int, sized to length of K
		sprintf(pK, "%d", K); //convert integer K into a char array
			
		//int pid to char pid
		char ppid[(sizeof(int)*CHAR_BIT-1)/3 + 3];
		sprintf(ppid, "%d", pid); //convert integer pid into a char array

		char pp[2] = ".";
		strncat(foutfilename, pp, 1); //add terminal "."
		strncat(foutfilename, ppid, strlen(ppid)); //add process id
		strncat(foutfilename, pp, 1); //add "."
		strncat(foutfilename, pK, strlen(pK)); //add "K"

		//open the base file (head) and the file to append (tail)
		FILE *head = fopen(outfilename, "ab");
		FILE *tail = fopen(foutfilename, "rb");
		if (!head || !tail) abort();
		
		//append
		char buf[1024];
		size_t n;
    	while ((n = fread(buf, 1, sizeof buf, tail)) > 0)
        if (fwrite(buf, 1, n, head) != n) abort();
    	if (ferror(tail)) abort();
    	
    	//close
    	fclose(head);
    	fclose(tail); 
    	
    	//delete the file just appended (to do)
	}

	
	
	
	/*
	//CALCULATE BEST K USING DIC
	//if(flag==1) //use the minimum DIC
	//{
		for(K=0;K<K_num;K++)
		{
			val_K[K]=dic[K][find_min(dic[K],initial.chainnum)];
		}
		K_best=find_min(val_K,K_num)+n_small;
	//}
	if((outfile=fopen(outfilename,"a+"))==NULL)
	{	nrerror("Cannot open output file!");}
	fprintf(outfile,"\n\nThe range of value for K is (%d - %d)!\n",n_small,n_large);			
	fprintf(outfile,"The optimal K is %d\n",K_best);
	fclose(outfile);
	free_dmatrix(dic,0,K_num-1,0,initial.chainnum-1);
	free_dvector(val_K,0,K_num-1);
	*/

}