/* InStruct.c */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>
#include <omp.h>
#include <unistd.h>
#include <sys/types.h>
#include <limits.h>
#include "nrutil.h"
#include "random.h"
#include "data_interface.h"
#include "initial.h"
#include "check_converg.h"
#include "mcmc.h"
#include "result_analysis.h"
#include "quantile.h"
#include "poly_geno.h"
/* Buffer-size constants. MAXLEN is also the width of the '=' rule lines written by printinfo(). */
#define MAXLINE 10000
#define MAXLEN 100
/* File-local helpers defined further down in this file. */
static void param_decomp(int, char **);
static void printinfo(char *, int, char **,SEQDATA);
static void inf_K_val(char *outfilename, int n_small, int n_large, SEQDATA *data,INIT initial);
static double mem_cal(SEQDATA data,INIT initial);
/*
 * Run-time options. Each global below holds the default value of one
 * command-line option (the option letter is given in the comment);
 * param_decomp() overwrites the defaults from argv.
 */
double siglevel=0.900; //-sl: significance level for the credible intervals
int nloci=100; //-L: total number of loci
int popnum=2; //-K: number of subpopulations
int totalsize=100; //-N: total number of individuals
int ploid=2; //-p: number of haplotypes in a genome (2 for diploid)
long updatenum=1000000; //-u: number of MCMC iterations
long burnin=500000; //-b: number of burn-in iterations (must not be zero)
int thinning=10; //-t: thinning interval length
int ckrep=20; //-r: number of stored iterations after burn-in used in convergence checking
int GR_flag=1; //-g: use the Gelman-Rubin statistic to check convergence, 1=yes, 0=no
int chainnum=2; //-c: number of MCMC chains
char *missingdata="-9"; //-m: token that represents missing data in the data file
char *datafilename=NULL; //-d: name of the data file
char *outfilename=NULL; //-o: name of the output file
char *initialfilename=NULL; //-i: name of the initial file (optional)
char *convgfilename=NULL; //-cf: name of the file storing updated values for convergence assessment
int label=1; //-lb: data file contains labels for individuals, 1=yes, 0=no
int popdata=1; //-a: data file contains a column with the original population information, 1=yes, 0=no
int prior_flag=0; //-f: which prior for selfing rates, uniform (0) or DPM (1) Dirichlet Process prior
double alpha_dpm=10; //-h: scaling parameter "alpha" of the Dirichlet Process Mixture model
int back_refl=1; //-e: proposal method for selfing rates, adaptive independence sampler (0) or back-reflection (1)
int type_freq=1; //-y: which way to calculate genotype frequency, expectation way (0) or structure way (1)
int nstep_check_empty_cluster=20; //-j: number of iterations after burn-in used to determine the existence of empty clusters
int n_extra_col=0; //-x: number of extra columns in the data file besides the label and popdata columns
int markername_flag=0; //-w: marker name line at the beginning of the data file, 1=present, 0=absent
int print_iter=1; //-pi: print the updated information of each iteration, 1=yes, 0=no
int print_freq=0; //-pf: print the allele frequencies to output file, 1=yes,0=no
int n_small=1; //-kv (first value): lower boundary for the value of K
int n_large=0; //-kv (second value): upper boundary for the value of K
int inf_K=0; //-ik: infer the number of subpopulations (1) or not (0)
int distr_fmt=1; //-df: output follows Distruct format (1) or not (0)
int autopoly=1; //-ap: auto-tetraploid=1; allo-tetraploid=0
int data_fmt=0; //-af: data_fmt=0 means one haploid per line;data_fmt=1 means one individual per line;
double max_mem=1.0e9; //-mm: maximum memory allowed; compared in main() against the estimate returned by mem_cal()
int mode=1; //-v: indicate which mode is used:
//infer population substructure without admixture only (0)
//infer population substructure only (1);
//infer substructure and selfing rates for populations (2);
//infer substructure and selfing rates for individuals (3);
//infer substructure and inbreeding coefficients for populations (4);
//infer substructure and inbreeding coefficients for individuals (5);
/*
* This application estimates the selfing rates for subpopulations and
* classifies each individual into subpopulations given the sequence data.
*
* Synopsis:
* InStruct -d data_file -o output_file [-i initial_file]
* [-K population number] [-L loci number] [-N total individual number]
* [-p ploid] [-u iteration number] [-b burn-in number] [-m missingdata]
* [-t thinning] [-c chain number] [-s seed1 seed2 seed3] [-sl significance level]
* [-lb label] [-a popdata] [-g GR_flag] [-r ckrep] [-f prior_flag] [-v mode]
* [-h alpha_dpm] [-e back_refl] [-y type_freq] [-j nstep_check_empty_cluster]
* [-x extra_columns] [-w markername] [-cf convgfilename] [-pi print_iter]
* [-pf print_freq] [-ik inf_K] [-kv n_small n_large] [-df distr_fmt] [-ap autopoly]
* [-af data_fmt] [-mm max_mem]
*
* Parameters:
* -d data_file - name of data file
* -o output_file - name of output file
* -i initial_file - name of initial file
* -K popnum - subpopulation number
* -L locinum - total loci number
* -N totalsize - total individual number
* -p ploid - the number of haplotype in a genome
* -u update - MCMC iteration number
* -b burnin - MCMC burn-in number
* -t thinning - MCMC thinning interval length
* -c chainnum - the number of MCMC chains
* -s seed1 seed2 seed3 three integers to override the default seed selection
* -m an integer represents missing data
* -sl significance level for the confidence intervals in result_analysis.c
* -lb label boolean indicates whether data_file contains labels
* for individuals, 1=yes, 0=no
* -a popdata boolean indicates whether data_file contains a column
* about the original population information, 1=yes, 0=no
* -g GR_flag boolean indicates whether the Gelman-Rubin statistic
* is used to check convergence, 1=yes, 0=no
* -r ckrep integer indicates how many stored iterations after
* burn-in are used in convergence checking
* -f prior_flag boolean indicates which prior for selfing rates,
* uniform (0) or DPM (1) Dirichlet Process prior
* -y boolean indicate which way to calculate genotype frequency,
* expectation way (0) or structure way (1)
* -e boolean indicate which proposal method for selfing rates,
* adaptive independence sampler(0) or back-reflection (1)
* -h alpha_dpm the scaling parameter "alpha" in Dirichlet Process Mixture model
* -j nstep_check_empty_cluster the number of iterations after burn-in
* that will be used to determine the existence of empty clusters
* -x extra_columns integer indicates the number of extra columns
* in data file besides label and popdata columns
* -w markernames boolean indicates existence of marker name
* line at the beginning of data file:
* no marker name line (0) or marker name line exist (1)
* -pi indicates whether to print the updated information of each iteration,
* etc. log-Likelihood. 1=yes, 0=no
* -cf convgfilename -name of the file storing updated values
* for convergence assessment
* -pf print_freq indicates whether to print the result of allele
* frequencies to output file, 1=yes, 0=no
* -ik inf_K indicates whether inferring the number of subpopulations or not
* -kv n_small n_large indicates the lower and upper boundary for value of K
* -df distr_fmt indicates whether to use the Distruct format for output (1) or not (0)
* -ap autopoly indicates whether the species is autopolyploid (1) or allopolyploid (0)
* -af data_fmt=0 means one haploid per line;data_fmt=1 means one individual per line;
* -mm max_mem maximum memory allowed for the run (default 1.0e9)
* -v mode integer indicate which mode is used:
* infer population substructure without admixture only (0);
* the rest options infer population structure with admixture:
* infer population substructure only (1);
* infer substructure and selfing rates for populations (2);
* infer substructure and selfing rates for individuals (3);
* infer substructure and inbreeding coefficients for populations (4);
* infer substructure and inbreeding coefficients for individuals (5);
*
* Example:
* ./InStruct -d example.str -o exampleout.txt -N 50 -L 50 -lb 1 -a 1 -w 1 -mm 3.0e9 -v 0 -ik 1 -kv 1 7 -c 1 -u 10000 -b 1250 -t 25 -r 250 -cf exampleconv.txt
* ./InStruct -d example.str -o exampleout.txt -N 50 -L 50 -lb 1 -a 1 -w 1 -mm 3.0e9 -v 0 -ik 1 -kv 1 7 -c 1 -u 10000 -b 1250 -t 25 -g 0
*/
/*
 * Program entry point.
 *
 * Parses the command line into the file-level option globals, reads the
 * data and the initial values, refuses to run when the estimated memory
 * need exceeds the configured maximum, writes the run header to the
 * output file and then either scans a range of K (inf_K==1) or runs
 * `chainnum` MCMC chains for the fixed K.
 *
 * Returns 0 after printing the completion message.
 */
int main(int argc, char **argv)
{
    SEQDATA seqdata;
    INIT initial;
    CHAIN chain;
    CONVG cvg;
    int chn;
    int init_popnum;
    int chain_accepted;

    param_decomp(argc,argv);
    seqdata=read_data(datafilename,ploid,totalsize,popnum,nloci,missingdata,label,popdata,siglevel,back_refl,type_freq,nstep_check_empty_cluster,prior_flag,mode,n_extra_col,markername_flag,alpha_dpm,print_iter,print_freq,inf_K,distr_fmt,autopoly,data_fmt,max_mem);

    /* With a fixed K the initial values are read for popnum populations;
       when K is inferred they are read for the upper bound n_large. */
    init_popnum = (inf_K==0) ? popnum : n_large;
    initial=read_init(initialfilename,chainnum,init_popnum,updatenum,burnin,thinning);

    if(seqdata.max_mem<mem_cal(seqdata,initial))
    {
        nrerror("Your request of memory exceeds the maximum memory allowed! Please change the parameter max_mem");
    }

    printinfo(outfilename,argc,argv,seqdata);

    if(inf_K==1)
    {
        /* make inference of K */
        inf_K_val(outfilename,n_small,n_large,&seqdata,initial);
    }
    else
    {
        /* cvg is only set up here when GR_flag==1, but its address is
           handed to mcmc_updating() in either case. */
        if(GR_flag==1)
        {
            allocate_convg(seqdata,&cvg,chainnum,ckrep,convgfilename);
        }

        chn=0;
        while(chn<chainnum)
        {
            chain=mcmc_updating(seqdata,initial,chn,&cvg);
            /* A chain flagged with an empty cluster is discarded and the
               same chain index is run again. */
            chain_accepted=(chain.flag_empty_cluster!=1);
            if(chain_accepted)
            {
                chain_stat(outfilename,chain,seqdata,chn);
            }
            free_chain(&chain,seqdata);
            if(chain_accepted)
            {
                chn++;
            }
        }

        if(GR_flag==1)
        {
            chain_converg(outfilename,&cvg);
            free_convg(&cvg);
        }
    }

    fprintf(stdout,"THE JOB IS SUCCESSFULLY FINISHED\n");
    return(0);
}
/*
 * Estimate the memory needed to store the retained MCMC iterations.
 *
 * A per-iteration size is accumulated from the data dimensions and the
 * number of populations K (data.popnum when inf_K==0, n_large when
 * inf_K==1, otherwise 0) and then multiplied by the number of retained
 * iterations, (update - burnin) / thinning using integer division.
 *
 * All size products are evaluated in double precision: computing them in
 * int first (as the earlier code did) overflows for large data sets.
 *
 * The estimate and data.max_mem are printed to stdout.
 *
 * Returns the estimate, in the same unit as data.max_mem.
 */
double mem_cal(SEQDATA data,INIT initial)
{
    double per_iteration=0.0;
    double total;
    double K=0.0;

    if(inf_K==0)
    {
        K=(double)data.popnum;
    }
    else if(inf_K==1)
    {
        K=(double)n_large;
    }

    /* allele-frequency storage is only counted when it will be printed */
    if(data.print_freq==1)
    {
        per_iteration+=8.0*K*(double)data.locinum*(double)data.allelenum_max;
    }
    per_iteration+=8.0+8.0*(double)data.totalsize;

    /* modes 2/4 add one value per population, modes 3/5 one per individual */
    switch(data.mode)
    {
        case 2:
        case 4:
            per_iteration+=8.0*K;
            break;
        case 3:
        case 5:
            per_iteration+=8.0*(double)data.totalsize;
            break;
        default:
            break;
    }

    per_iteration+=4.0*(double)data.totalsize;
    per_iteration+=8.0*(double)data.totalsize*K;

    /* the division is deliberately integral: it counts whole retained iterations */
    total=per_iteration*(double)((initial.update-initial.burnin)/initial.thinning);

    fprintf(stdout,"The memory required for this run is %f \n",total);
    fprintf(stdout,"The maximum memory allowed is %f \n",data.max_mem);
    return(total);
}
/* How the operand of a single-operand command-line flag is converted. */
enum cli_operand_kind { CLI_STRING, CLI_INT, CLI_LONG, CLI_DOUBLE };

/* A flag taking exactly one operand: its spelling, the conversion to
   apply, and the address of the option global that receives the value. */
struct cli_option {
    const char *flag;
    enum cli_operand_kind kind;
    void *dest;
};

/*
 * Return 1 when at least `count` operands follow argv[i]. Otherwise the
 * problem is reported through nrerror() and 0 is returned so the caller
 * never reads past the end of argv.
 */
static int cli_has_operands(int argc, char **argv, int i, int count)
{
    char msg[256];

    if(count<argc-i)
    {
        return 1;
    }
    snprintf(msg,sizeof msg,"Option %s needs %d value(s) after it in the command line!",argv[i],count);
    nrerror(msg);
    return 0;
}

/*
 * Function param_decomp decomposes the command-line arguments and stores
 * them in the file-level option globals.
 *
 * argc, argv - the arguments received by main().
 *
 * Behaviour:
 *  - "InStruct -h" alone prints the synopsis and exits with status 1.
 *  - Fewer than 5 arguments is reported through nrerror().
 *  - Every token of argv is compared with the known flags; unknown tokens
 *    are ignored. Operands are not skipped, they are simply tokens that
 *    match no flag.
 *  - A flag whose operand(s) are missing at the end of the command line
 *    is reported through nrerror() instead of reading beyond argv.
 *  - Numeric operands are converted with atoi/atol/atof, i.e. leniently.
 *  - "-b 0" (or a non-numeric burn-in) is rejected.
 *  - A non-positive thinning interval is rejected before it is used as a
 *    divisor; afterwards ckrep and nstep_check_empty_cluster are checked
 *    against the number of retained iterations (updatenum-burnin)/thinning.
 */
void param_decomp(int argc, char ** argv)
{
    /* all flags that take exactly one operand */
    static const struct cli_option options[] = {
        {"-d",  CLI_STRING, &datafilename},     /* data file name */
        {"-o",  CLI_STRING, &outfilename},      /* output file name */
        {"-i",  CLI_STRING, &initialfilename},  /* initial file name */
        {"-cf", CLI_STRING, &convgfilename},    /* convergence file name */
        {"-m",  CLI_STRING, &missingdata},      /* token representing missing data */
        {"-L",  CLI_INT,    &nloci},            /* number of loci */
        {"-N",  CLI_INT,    &totalsize},        /* number of individuals */
        {"-K",  CLI_INT,    &popnum},           /* number of populations */
        {"-p",  CLI_INT,    &ploid},            /* haplotypes per genome; 2 for diploid */
        {"-u",  CLI_LONG,   &updatenum},        /* number of MCMC update steps */
        {"-b",  CLI_LONG,   &burnin},           /* burn-in length, must not be zero */
        {"-t",  CLI_INT,    &thinning},         /* thinning interval */
        {"-c",  CLI_INT,    &chainnum},         /* number of MCMC chains */
        {"-lb", CLI_INT,    &label},            /* data file has individual labels, 1=yes, 0=no */
        {"-a",  CLI_INT,    &popdata},          /* data file has an original-population column */
        {"-g",  CLI_INT,    &GR_flag},          /* use the Gelman-Rubin statistic, 1=yes, 0=no */
        {"-f",  CLI_INT,    &prior_flag},       /* prior for selfing rates: uniform (0) or DPM (1) */
        {"-v",  CLI_INT,    &mode},             /* inference mode, 0-5 */
        {"-r",  CLI_INT,    &ckrep},            /* stored iterations used in convergence checking */
        {"-e",  CLI_INT,    &back_refl},        /* proposal: independence sampler (0) or back-reflection (1) */
        {"-y",  CLI_INT,    &type_freq},        /* genotype frequency: expectation (0) or structure (1) way */
        {"-x",  CLI_INT,    &n_extra_col},      /* number of extra columns in the data file */
        {"-pi", CLI_INT,    &print_iter},       /* print information of each iteration */
        {"-ap", CLI_INT,    &autopoly},         /* autopolyploid (1) or allopolyploid (0) */
        {"-pf", CLI_INT,    &print_freq},       /* print allele frequencies to the output file */
        {"-w",  CLI_INT,    &markername_flag},  /* marker name line present in the data file */
        {"-af", CLI_INT,    &data_fmt},         /* input file format */
        {"-mm", CLI_DOUBLE, &max_mem},          /* maximum memory allowed */
        {"-ik", CLI_INT,    &inf_K},            /* infer the number of subpopulations or not */
        {"-df", CLI_INT,    &distr_fmt},        /* Distruct output format (1) or not (0) */
        {"-sl", CLI_DOUBLE, &siglevel},         /* significance level */
        {"-h",  CLI_DOUBLE, &alpha_dpm},        /* alpha of the Dirichlet Process Mixture model */
        {"-j",  CLI_INT,    &nstep_check_empty_cluster} /* iterations used to detect empty clusters */
    };
    const size_t option_count=sizeof options/sizeof options[0];
    const char *msg=
        "Synopsis:\n\tInStruct -d data_file -o output_file [-i initial_file] "
        "[-K population number] [-L loci number] [-N total individual number] "
        "[-p ploid] [-u iteration number] [-b burn-in number] [-m missingdata] "
        "[-t thinning] [-c chain number] [-s seed1 seed2 seed3] [-sl significance level] "
        "[-lb label] [-a popdata] [-g GR_flag] [-r ckrep] [-f prior_flag] [-v mode] "
        "[-h alpha_dpm] [-e back_refl] [-y type_freq] [-j nstep_check_empty_cluster] "
        "[-x extra_columns] [-w markername] [-cf convgfilename] [-pi print_iter] "
        "[-pf print_freq] [-ik inf_K] [-kv n_small n_large] [-df distr_fmt] [-ap autopoly] "
        "[-af data_fmt] [-mm max_mem]\n";
    long seeds[3];
    long retained;
    size_t k;
    int i,j;

    if(argc==2&&strcmp(argv[1],"-h")==0) /*print help message*/
    {
        fprintf(stdout,"%s",msg);
        exit(1);
    }

    if(argc<5)
    {
        nrerror("Too few arguments in the command line!");
    }
    else
    {
        for(i=1;i<argc;i++)
        {
            const char *flag=argv[i];

            if(strcmp(flag,"-kv")==0)
            { /*-kv sets the lower and upper boundary for the value of K*/
                if(cli_has_operands(argc,argv,i,2))
                {
                    n_small=atoi(argv[i+1]);
                    n_large=atoi(argv[i+2]);
                }
                continue;
            }

            if(strcmp(flag,"-s")==0)
            { /*-s resets the three seeds of the random number generator*/
                if(cli_has_operands(argc,argv,i,3))
                {
                    for(j=0;j<3;j++)
                    {
                        seeds[j]=atoi(argv[i+j+1]);
                    }
                    setseeds(seeds[0],seeds[1],seeds[2]);
                }
                continue;
            }

            for(k=0;k<option_count;k++)
            {
                if(strcmp(flag,options[k].flag)!=0)
                {
                    continue;
                }
                if(cli_has_operands(argc,argv,i,1))
                {
                    char *operand=argv[i+1];

                    switch(options[k].kind)
                    {
                        case CLI_STRING: *(char **)options[k].dest=operand; break;
                        case CLI_INT:    *(int *)options[k].dest=atoi(operand); break;
                        case CLI_LONG:   *(long *)options[k].dest=atol(operand); break;
                        case CLI_DOUBLE: *(double *)options[k].dest=atof(operand); break;
                    }
                    if(options[k].dest==(void *)&burnin&&burnin==0)
                    {
                        nrerror("Burn-in should not be zero!");
                    }
                }
                break; /* flags are distinct, so at most one entry matches */
            }
        }
    }

    if(thinning<=0)
    {
        /* guards the divisions below against a zero divisor */
        nrerror("The thinning interval should be a positive integer!");
    }
    else
    {
        retained=(updatenum-burnin)/thinning;
        if(ckrep>retained)
        {
            nrerror("The number of iterations for convergence assessment is greater than the total number of retained iterations from MCMC.");
        }
        if(nstep_check_empty_cluster>retained)
        {
            nrerror("The number of iterations for checking the existence of empty cluster is greater than the total number of retained iterations from MCMC.");
        }
    }
}
/*
 * Write the run header to the output file.
 *
 * outfilename - file to create (opened with mode "w", so an existing
 *               file is truncated); failure to open it is reported
 *               through nrerror().
 * argc, argv  - the command line, echoed verbatim.
 * data        - supplies the data-related settings that are reported.
 *
 * The remaining values (file names, chain number, iteration counts,
 * thinning, alpha_dpm, GR_flag, ckrep) are read from the file-level
 * option globals.
 */
void printinfo(char *outfilename,int argc, char **argv,SEQDATA data)
{
    /* diploid mode descriptions, indexed by data.mode (0..5) */
    static const char *const diploid_mode_text[] = {
        "Make inference of population structure only without admixture.\n",
        "Make inference of population structure only with admixture.\n",
        "Make inference of population structure and the selfing rates for subpopulations.\n",
        "Make inference of population structure and the selfing rates for individuals.\n",
        "Make inference of population structure and the inbreeding coefficients for subpopulations.\n",
        "Make inference of population structure and the inbreeding coefficients for individuals.\n"
    };
    FILE *out;
    int k;

    out=fopen(outfilename,"w");
    if(out==NULL)
    {
        nrerror("Cannot open output file!");
    }

    /* banner framed by two rule lines of MAXLEN '=' characters */
    fputs("\n",out);
    for(k=MAXLEN;k>0;k--)
    {
        fputc('=',out);
    }
    fputs("\n\tInStruct by Gao, Williamson and Bustamante (2007)\n",out);
    fputs("\t\t Code by Hong Gao\n",out);
    fputs("\t\tVersion 1.0 (May. 2007)\n",out);
    for(k=MAXLEN;k>0;k--)
    {
        fputc('=',out);
    }

    /* echo of the command line */
    fputs("\n\n\n\nCommand line arguments:\n ",out);
    for(k=0;k<argc;k++)
    {
        fprintf(out,"%s ",argv[k]);
    }
    fputs("\n\n",out);

    /* file names */
    fprintf(out,"Data File: %s\n",datafilename);
    if(initialfilename!=NULL)
    {
        fprintf(out,"Initial File: %s\n",initialfilename);
    }
    fprintf(out,"Output File: %s\n\n",outfilename);

    /* run parameters */
    fputs("\nRun parameters:\n",out);
    fprintf(out," Chain Number=%d\n",chainnum);
    fprintf(out," MCMC Iterations Number=%ld\n",updatenum);
    fprintf(out," Burn-in=%ld\n",burnin);
    fprintf(out," Thinning=%d\n",thinning);
    fprintf(out," Ploid=%d\n",data.ploid);
    if(data.ploid>2)
    {
        if(data.autopoly==1)
        {
            fputs("Autopolyploid assumed\n",out);
        }
        else if(data.autopoly==0)
        {
            fputs("Allopolyploid assumed\n",out);
        }
    }
    fprintf(out," Missing Data=%s\n",data.missingdata);
    fprintf(out," Population size=%d\n",data.totalsize);
    fprintf(out," Number of loci=%d\n",data.locinum);
    fprintf(out," Population number assumed=%d\n",data.popnum);
    fprintf(out," Significance level for Posterior Credible Interval=%f\n",data.siglevel);

    /* inference mode; nothing follows the label for other ploidies or modes */
    fputs(" Mode = ",out);
    if(data.ploid==2)
    {
        if(data.mode>=0&&data.mode<=5)
        {
            fputs(diploid_mode_text[data.mode],out);
        }
    }
    else if(data.ploid==4)
    {
        fputs("Make inference of population structure and the selfing rates for subpopulations.\n",out);
    }

    if(data.inf_K==1)
    {
        fputs("\nMake inference of the number of subpopulations.\n",out);
    }

    /* the prior is only reported for the per-individual modes */
    if(data.mode==3||data.mode==5)
    {
        if(data.prior_flag==0)
        {
            fputs("The Uniform prior is used for selfing rates.\n\n",out);
        }
        if(data.prior_flag==1)
        {
            fprintf(out,"The Dirichlet Process prior is used for selfing rates and the scaling parameter is %f.\n\n",alpha_dpm);
        }
    }

    if(data.back_refl==0)
    {
        fputs("The proposal method for selfing rates is adaptive independence sampler.\n",out);
    }
    else if(data.back_refl==1)
    {
        fputs("The proposal method for selfing rates is back-reflection.\n",out);
    }

    if(data.print_freq==1)
    {
        fputs("The posterior allele frequencies will also be summarized and written to output file.\n",out);
    }
    if(GR_flag==1)
    {
        fprintf(out,"The %d stored iteration results after burn-in will be used to calculate the GR statistic.\n",ckrep);
    }
    if(data.distr_fmt==1)
    {
        fputs("The output of Q are generated in the Distruct format.\n",out);
    }

    fclose(out);
}
/*
 * Build the name of the per-K output file, "<base>.<pid>.<K>", in a
 * freshly allocated buffer that is large enough for the whole name and
 * its terminator. The caller frees the result. Returns NULL on failure.
 */
static char *K_output_filename(const char *base, int pid, int K)
{
    char *name;
    int len=snprintf(NULL,0,"%s.%d.%d",base,pid,K);

    if(len<0)
    {
        return NULL;
    }
    name=malloc((size_t)len+1);
    if(name!=NULL)
    {
        snprintf(name,(size_t)len+1,"%s.%d.%d",base,pid,K);
    }
    return name;
}

/*
 * Run the MCMC chains for every K in [n_small, n_large], one K per
 * OpenMP loop iteration, then append the per-K result files to the main
 * output file.
 *
 * outfilename      - main output file; each K first writes to
 *                    "<outfilename>.<pid>.<K>".
 * n_small, n_large - range of K. An invalid range (either bound < 1 or
 *                    n_small > n_large) is replaced by
 *                    1 .. (int)pow(totalsize,0.3)+1 and a notice is
 *                    printed once.
 * data             - sequence data; on return data->popnum is n_large,
 *                    the last K of the range.
 * initial          - initial values; initial.chainnum chains are run per K.
 *
 * Each loop iteration works on its own copy of *data with popnum set to
 * K. Writing K into the shared structure (as the earlier code did) lets
 * concurrently running iterations overwrite each other's K.
 *
 * NOTE(review): mcmc_updating(), allocate_convg(), chain_stat() and
 * chain_converg() are called concurrently from several threads; whether
 * they are safe for that (shared random-number state, the shared
 * convgfilename) cannot be told from this file - confirm.
 *
 * TODO: the value returned by chain_stat() was formerly collected to
 * select the best K; that selection step is disabled, so the value is
 * discarded here. The per-K files are also left in place after they have
 * been appended.
 */
void inf_K_val(char *outfilename, int n_small, int n_large, SEQDATA *data, INIT initial)
{
    int parallelism_enabled = 1; //0=no, not 0 = yes
    int pid = getpid();
    int K;

    /* validate the range once, before any thread is started */
    if(n_large<1||n_small<1||n_small>n_large)
    {
        n_small=1;
        n_large=(int)pow((double)data->totalsize,0.3)+1;
        fprintf(stdout,"The range of value for K is not correct! Change to default value (%d - %d)!\n",n_small,n_large);
    }

    #pragma omp parallel for if(parallelism_enabled)
    for(K=n_small;K<=n_large;K++)
    {
        /* everything declared here is private to the iteration */
        SEQDATA run_data=*data;
        CONVG cvg;
        CHAIN chain;
        FILE *outfile;
        char *foutfilename;
        int chn;
        int tid=omp_get_thread_num();

        run_data.popnum=K;

        //each K produces its own output file
        foutfilename=K_output_filename(outfilename,pid,K);
        if(foutfilename==NULL)
        {
            nrerror("Cannot allocate memory for the output file name!");
            continue;
        }

        printf("t%d,K%d,pK%d,pp.,out(%s),fout(%s),pid(%d)\n",tid,K,K,outfilename,foutfilename,pid);
        printf("t%d,data.popnum%d\n",tid,run_data.popnum);

        if((outfile=fopen(foutfilename,"a+"))==NULL)
        {
            nrerror("Cannot open output file!");
            free(foutfilename);
            continue;
        }
        fprintf(outfile,"\n\nThe current K is %d\n",K);
        fclose(outfile);

        if(GR_flag==1)
        {
            allocate_convg(run_data,&cvg,chainnum,ckrep,convgfilename);
        }
        for(chn=0;chn<initial.chainnum;chn++)
        {
            chain=mcmc_updating(run_data,initial,chn,&cvg);
            (void)chain_stat(foutfilename,chain,run_data,chn);
            free_chain(&chain,run_data);
        }
        if(GR_flag==1)
        {
            chain_converg(foutfilename,&cvg);
            free_convg(&cvg);
        }

        free(foutfilename);
    }

    /* same final state as a sequential pass over the range */
    data->popnum=n_large;

    //COMBINE SEPARATE OUTPUT FILES
    printf("combining files.\n");
    for(K=n_small;K<=n_large;K++)
    {
        char buf[1024];
        size_t n;
        FILE *head;
        FILE *tail;
        char *foutfilename=K_output_filename(outfilename,pid,K);

        if(foutfilename==NULL)
        {
            abort();
        }

        //open the base file (head) and the file to append (tail)
        head=fopen(outfilename,"ab");
        tail=fopen(foutfilename,"rb");
        if(!head||!tail)
        {
            abort();
        }

        //append
        while((n=fread(buf,1,sizeof buf,tail))>0)
        {
            if(fwrite(buf,1,n,head)!=n)
            {
                abort();
            }
        }
        if(ferror(tail))
        {
            abort();
        }

        //close; a failed close of the written stream means lost output
        if(fclose(head)!=0)
        {
            abort();
        }
        fclose(tail);
        free(foutfilename);
    }
}