void calc_stat(int begI,int endI,perChr &pc,kstring_t &str,int nChr){ double tW = slice(begI,endI,pc.tW); double tP = slice(begI,endI,pc.tP); double tF = slice(begI,endI,pc.tF); double tH = slice(begI,endI,pc.tH); double tL = slice(begI,endI,pc.tL); ksprintf(&str,"%f\t%f\t%f\t%f\t%f\t",tW,tP,tF,tH,tL); double tajima=tajd(nChr,tW,tP); double fuf = fulif(nChr,tW,tF,tP); double fud = fulid(nChr,tW,tF); double fayh_val = fayh(nChr,tW,tH,tP); double zeng_val = zenge(nChr,tW,tL); // fprintf(stderr,"%f\n",tajima); ksprintf(&str,"%f\t%f\t%f\t%f\t%f",tajima,fuf,fud,fayh_val,zeng_val); }
/**
 * Read ms-style simulation output and accumulate summary statistics.
 *
 * The file is scanned line by line.  A "segsites:" line sets the number of
 * SNPs of the next replicate; the following "positions:" line starts a
 * replicate record: positions are scaled by the members `length` and `rho`,
 * then one haplotype line per sampled chromosome (sum of `samples[]`) is
 * read, per-population allele frequencies are computed with frequency(),
 * and the running sums (abba/baba, fst numerator/denominator, l2 terms,
 * conditional frequency histograms, per-replicate td/thetaw) are updated.
 * When `printsnplist` is set, one line per SNP is written to `snpfile`.
 *
 * Populations 0, 1 and 2 are indexed unconditionally, so at least three
 * populations are required; population 3 is used when nsamples >= 4.
 *
 * The process exits with status 1 when the file cannot be opened, when
 * fewer than three populations are configured, when the input holds more
 * replicates than `reps`, or when a haplotype line is shorter than the
 * announced number of sites.
 *
 * NOTE(review): in the visible source the accumulated statistics are not
 * reported anywhere and the function had no closing brace — reporting code
 * may have been lost; confirm against the upstream file.
 *
 * @param filename  path of the ms output file to read
 */
void statsms::readms (string filename){
    ifstream inp (filename.c_str());
    if (!inp.is_open()){
        cerr << "Error reading file "<< filename <<endl;
        exit(1);
    }
    // freq[2], nind[1] etc. are used without further checks below.
    if (nsamples < 3) {
        cerr << "Error: readms needs at least 3 populations, got " << nsamples << endl;
        exit(1);
    }

    string line;
    int linenum = 0;
    int chrindex = 0;
    int snpindex = 0;
    int nsnp = 0;
    double totalw = 0;
    ofstream snpfs (snpfile.c_str());
    int totalsnps = 0;
    vector<int> nsegsites (nsamples,0);

    // Chromosomes per population and overall.
    int totalnind = 0 ;
    vector<int> nind (nsamples,0);
    for (int i = 0 ; i < nsamples; i++) {
        nind[i] = samples[i];
        totalnind += nind[i];
    }

    // Row tables; the rows of snpmat/freq are (re)allocated per replicate.
    char ** snpmat = new char*[totalnind];
    double **freq = new double*[nsamples];
    double **td = new double *[nsamples];
    double **thetaw = new double *[nsamples];
    for (int i = 0 ; i < nsamples; i++) {
        td[i] = new double[reps];
        thetaw[i] = new double[reps];
    }

    double abba = 0 ;
    double baba = 0 ;
    double abba2 = 0 ;
    double baba2 = 0 ;
    double fstnum = 0;
    double fstdenom = 0;
    double l2yn = 0 ;
    double l2en = 0;
    double l2denom = 0;

    // Histograms over the derived-allele count in population 0 (pop1*) or
    // population 1 (pop2*), conditioned on populations 2 ("n") and 3 ("d").
    // Each one is sized by the population whose count indexes it.
    vector<int> pop1nd10 (nind[0]+1,0);
    vector<int> pop1nd01 (nind[0]+1,0);
    vector<int> pop2nd10 (nind[1]+1,0);
    vector<int> pop2nd01 (nind[1]+1,0);
    vector<int> pop1nd00 (nind[0]+1,0);
    vector<int> pop2nd00 (nind[1]+1,0);
    vector<int> pop1nd11 (nind[0]+1,0);
    vector<int> pop2nd11 (nind[1]+1,0);

    vector<int> pop1n1 (nind[0]+1,0);
    vector<int> pop2n1 (nind[1]+1,0);   // indexed by population-1 counts
    vector<int> pop1d1 (nind[0]+1,0);
    vector<int> pop2d1 (nind[1]+1,0);
    vector<int> pop1n0 (nind[0]+1,0);
    vector<int> pop2n0 (nind[1]+1,0);   // indexed by population-1 counts
    vector<int> pop1d0 (nind[0]+1,0);
    vector<int> pop2d0 (nind[1]+1,0);

    // Per-replicate accumulators.
    vector<double> chrabba = vector<double>(reps,0);
    vector<double> chrbaba = vector<double>(reps,0);
    vector<double> chrsnps = vector<double>(reps,0);
    vector<double> chrfstnum = vector<double> (reps,0);
    vector<double> chrfstdenom = vector<double> (reps,0);
    vector<double> chrw = vector<double> (reps,0);

    int totalnsnps = 0;
    int rep = 0 ;
    int index = 0;
    string id;

    while ( std::getline (inp, line)){
        linenum ++;
        io::println ("line = " + line, 2);
        if (line.empty() || line[0] == '#')
            continue;

        if ( line.find ("segsites:")==0) {
            string l1 = line.substr(9,line.length()-9);
            nsnp = atoi (l1.c_str());
            if (nsnp < 0)
                nsnp = 0;
            io::println ("SNPs = " + tostring(nsnp),2);
        }

        if ( line.find ("positions:")==0) {
            vector<double> genmap ;
            vector<double> physmap;
            vector<string> snpids;
            chrindex ++;
            rep++;
            // Every per-replicate array has exactly `reps` slots.
            if (rep > reps) {
                cerr << "Error reading file " << filename << ": more than " << reps << " replicates" << endl;
                exit(1);
            }

            // Parse at most nsnp positions; stop on the first failed read
            // instead of consuming a garbage value at end of line.
            istringstream ss (line.substr(10,line.length()-10));
            int count = 0 ;
            double pos;
            while (count < nsnp && ss >> pos) {
                double ppos = pos * length;
                double gpos = ppos * rho;
                count++;
                genmap.push_back (gpos);
                physmap.push_back (ppos);
                id = "snp" + tostring(index);
                snpids.push_back (id);
                index++;
            }

            // Allocate the rows here so that allocation and release are
            // always paired, even when a "segsites:" line has no
            // "positions:" line after it.
            for (int i = 0 ; i < totalnind ; i++)
                snpmat[i] = new char[nsnp];
            for (int i = 0 ; i < nsamples; i++)
                freq[i] = new double[nsnp];

            // One haplotype line per chromosome.
            for (int i = 0 ; i < totalnind ; i ++){
                string s1;
                std::getline (inp, s1);
                if (s1.length() < static_cast<string::size_type>(nsnp)) {
                    cerr << "Error reading file " << filename << ": haplotype line shorter than " << nsnp << " sites" << endl;
                    exit(1);
                }
                for (int j = 0 ; j < nsnp ; j++)
                    snpmat[i][j] = s1[j];
            }

            vector<int> nsegsites(nsamples,0);
            vector<double> pi(nsamples, 0 );
            vector<double> h(nsamples,0);
            vector<double> th(nsamples,0);

            // Per-population frequency of allele '1'; t is the row offset of
            // population i inside snpmat.
            for (int i = 0, t = 0 ; i < nsamples; i++){
                for (int j = 0 ; j < nsnp; j++){
                    freq[i][j] = frequency('1', j, samples[i], snpmat+t)/((double)samples[i]);
                    nsegsites[i] += (freq[i][j] > 0 && freq[i][j] < 1.0);
                }
                t += samples[i];
            }

            chrsnps[rep-1] = nsnp;
            totalnsnps += nsnp;

            for (int j = 0 ; j < nsnp ; j++) {
                if (nsamples >= 5 ) {
                    if (ascertain.compare ("african")==0) {
                        // Skip sites absent from populations 1-4.
                        if (freq[2][j] == 0 && freq[3][j] == 0 && freq[4][j] ==0 && freq[1][j]==0)
                            continue;
                    }
                }

                if (freq[2][j]>=1){
                    abba += freq[1][j] * ( 1-freq[0][j]);
                    baba += freq[0][j] * ( 1-freq[1][j]);
                    chrabba[rep-1] += freq[1][j] * ( 1-freq[0][j]);
                    chrbaba [rep-1] += freq[0][j] * ( 1-freq[1][j]);
                }

                if (freq[2][j]>0){
                    int ind = round(freq[0][j]*samples[0]);
                    pop1n1[ind]++;
                    ind = round(freq[1][j]*samples[1]);
                    pop2n1[ind]++;
                } else {
                    int ind = round(freq[0][j]*samples[0]);
                    pop1n0[ind]++;
                    ind = round(freq[1][j]*samples[1]);
                    pop2n0[ind]++;
                }

                if (nsamples >= 4 ) {
                    if (freq[3][j]>=1){
                        abba2 += freq[1][j] * ( 1-freq[0][j]);
                        baba2 += freq[0][j] * ( 1-freq[1][j]);
                    }

                    if (freq[3][j] > 0 ) {
                        int ind = round(freq[0][j]*samples[0]);
                        pop1d1[ind]++;
                        ind = round(freq[1][j]*samples[1]);
                        pop2d1[ind]++;
                    } else {
                        int ind = round(freq[0][j]*samples[0]);
                        pop1d0[ind]++;
                        ind = round(freq[1][j]*samples[1]);
                        pop2d0[ind]++;
                    }

                    if (freq[2][j]>0 && freq[3][j]==0) {
                        int ind = round(freq[0][j]*samples[0]);
                        pop1nd10[ind]++;
                        ind = round(freq[1][j]*samples[1]);
                        pop2nd10[ind]++;
                    }
                    if (freq[3][j]>0 && freq[2][j]==0) {
                        int ind = round(freq[0][j]*samples[0]);
                        pop1nd01[ind]++;
                        ind = round(freq[1][j]*samples[1]);
                        pop2nd01[ind]++;
                    }
                    if (freq[2][j]==0 && freq[3][j]==0) {
                        int ind = round(freq[0][j]*samples[0]);
                        pop1nd00[ind]++;
                        ind = round(freq[1][j]*samples[1]);
                        pop2nd00[ind]++;
                    }
                    if (freq[2][j]>0 && freq[3][j]>0) {
                        int ind = round(freq[0][j]*samples[0]);
                        pop1nd11[ind]++;
                        ind = round(freq[1][j]*samples[1]);
                        pop2nd11[ind]++;
                    }
                }

                // Absent from population 0 and polymorphic in population 1.
                if (freq[0][j]<=0 && freq[1][j]>0 && freq[1][j]<1) {
                    l2en +=pow (freq[0][j] - freq[2][j],2);
                    l2yn +=pow (freq[1][j] - freq[2][j],2);
                    l2denom ++;
                }

                int n0 = samples[0];
                int n1 = samples[1];
                // Round, as the histogram code above does: freq*n can land
                // just below the integer count and would truncate to count-1.
                int a0 = round(freq[0][j] * n0);
                int a1 = round(freq[1][j] * n1);
                double w = 1;
                double f0 = freq[0][j];
                double f1 = freq[1][j];

                // Per-site weight (and sample correction) for the chosen
                // ascertainment scheme.
                if (ascertain.compare("hetinb")==0) {
                    a1 --;  // Account for ascertainment
                    n1 -= 2;
                    f1 = ((double)a1)/n1;
                    w = 2*freq[1][j]*(1-freq[1][j]);
                } else if (ascertain.compare("outgroup")==0) {
                    if (freq[2][j]>0)
                        w = 1 ;
                    else
                        w = 0;
                } else if (ascertain.compare ("all")==0) {
                    w = 1;
                } else if (ascertain.compare ("hetinout") == 0) {
                    w = freq[2][j]*(1-freq[2][j]);
                } else if (ascertain.compare ("polyinb")==0) {
                    if (freq[1][j]>0 && freq[1][j] < 1 )
                        w = 1;
                    else
                        w = 0;
                } else if (ascertain.compare ("polyinanb")==0){
                    if (freq[1][j]>0 && freq[1][j] < 1 && freq[0][j] > 0 && freq[0][j] < 1)
                        w = 1;
                    else
                        w = 0;
                } else if (ascertain.compare ("polyinaorb")==0) {
                    if ((freq[1][j]>0 && freq[1][j] < 1)||(freq[0][j] > 0 && freq[0][j] < 1))
                        w = 1;
                    else
                        w = 0;
                } else if (ascertain.compare ("polyina")==0) {
                    if (freq[0][j]>0 && freq[0][j] < 1 )
                        w = 1;
                    else
                        w = 0;
                } else if (ascertain.compare ("hetina")==0) {
                    a0 --;  // Account for ascertainment
                    n0 -= 2;
                    f0 = ((double)a0)/n0;
                    w = 2*freq[0][j]*(1-freq[0][j]);
                }

                double h0 = 0 ;
                double h1 = 0 ;
                if (n0 > 1)
                    h0 = a0*(n0-a0)/(n0*(n0-1.));
                if (n1 > 1)
                    h1 = a1*(n1-a1)/(n1*(n1-1.));
                double fstn = pow((f0-f1),2) - h0/n0 - h1/n1;
                double fstd = fstn + h0 + h1;

                fstnum += fstn * w;
                fstdenom += fstd * w;
                chrfstnum[rep-1] += fstn * w;
                chrfstdenom[rep-1] += fstd * w;
                chrw[rep-1] += w;
                totalw += w;
            }

            // Per-population diversity statistics for this replicate.
            for (int i = 0 , t = 0 ; i < nsamples; i++) {
                pi[i] = nucdiv (samples[i], nsnp, snpmat +t );
                h[i] = hfay (samples[i], nsnp, snpmat +t );
                th[i] = thetah (samples[i], nsnp, snpmat +t );
                td [i][rep-1] = tajd(samples[i], nsegsites[i], pi[i]);
                thetaw[i][rep-1] = nsegsites[i]/(2*a1f(samples[i]));
                t += samples[i];
            }

            for (int i = 0 ; i < nsamples; i++)
                delete[] freq[i];
            for (int i = 0 ; i < totalnind; i++)
                delete[] snpmat[i];

            if (printsnplist) {
                // Only SNPs whose position was actually parsed are listed.
                const int nlisted = static_cast<int>(snpids.size());
                for (int j =0 ; j < nsnp && j < nlisted; j++){
                    snpfs << snpids[j] << "\t" << chrindex << "\t" << genmap[j] << "\t" << physmap[j] << endl;
                }
            }
        } // One set of MS sims

        if (io::debug >= 2) {
            cout << "Next MS sim " <<endl;
        }
    }

    // Release the per-replicate tables and the row tables.
    for (int i = 0 ; i < nsamples; i++) {
        delete[] td[i];
        delete[] thetaw[i];
    }
    delete[] td;
    delete[] thetaw;
    delete[] freq;
    delete[] snpmat;
}
/********************************************************* * * * Main Function * * ------------- * * * ********************************************************* | Takes arguments and launches the gof simulations. | *-------------------------------------------------------*/ int main(int argc, char *argv[]) // Array of char=arguments line { //--- Declarations & Call Function ---// int i=0, j=0, k=0, count=0, howmany=0, segsites=0, okim=0, oksim=0, numsim=0, totsim=0, nokl=0, nokl0=0; int **statseg=NULL, **nbvariant=NULL, oks[3], okstot[3]; FILE *pf=NULL, *fopen(const char*, const char*); // Pointer on File for outputs (pf) and IM input file double tajd(); char **list=NULL; // Haplotype list void updatemainparams(struct params*); int gensam(struct params*, char**, int**, int*); int **imatrix(int, int); //// From rand1.c //// /* Celine changed 03/18/2010 */ void seedit(char*, FILE*, struct params *);/*/////*/ char **cmatrix(int, int); //// From params.c //// void changeparams(struct params*); void changeparamslocus(struct params*, int); struct params getpars(int, char*[], int*); //--- Structure declaration---// struct params param; //--- Get arguments ---// param=getpars(argc, argv, &howmany); // Get input by user for parameters pf=stdout; // Output /* Celine changed 03/18/2010 */ if( !param.commandlineseedflag ) seedit("s", pf, ¶m);// WRITE seeds in summary output file /*/////*/ /* Uncommented for Celine's use */ /* for(i=0;i<argc;i++) // Information on simulation fprintf(pf, "%s ", argv[i]); */////\ //---------- Initialisation & Memory allocation ------------------// nbvariant=imatrix(param.cp.npop+1, maxsites); // array of nb of frequency spectrum typeseg=(int*)malloc((unsigned)(maxsites*sizeof(int))); // type of sites statseg=imatrix(param.cp.npop+3, howmany); // Records locus specific S1, S2, Ss, Sf, changeparams(¶m); // Change estimates parameters from priors updatemainparams(¶m); // Update parameters for the coalescent 
oksim=totsim=okstot[0]=okstot[1]=okstot[1]=okstot[2]=0; // simulation check, total #of sim, check on statistics for all simulations for(numsim=0; numsim<param.cp.nsim;numsim++) // Loop along number of simulations for this set of parameters { //--- Initialization and reset of quality checks ---// count=nokl=nokl0=okim=oks[0]=oks[1]=oks[1]=oks[2]=0; // number of loci, # loci with ok genealogies, no set sites, #statistics ok for(i=0;i<11;i++) // Sim specific Stats param.cp.sSiFst[i]=0; //--- Loop along the loci in the simulation ---// while((howmany-count++)) { if(okim==0) // Case All loci ok in the sample { for(i=0;i<11;i++) { param.cp.lSiFst[i]=0.0; if(i<9) param.lp[count-1].tpH[i]=0; } changeparamslocus(¶m, count-1); // Get locus specific parameters list=cmatrix(param.cp.nsam, maxsites+1); // Allocate list of haplotypes segsites=gensam(¶m, list, nbvariant, param.lp[count-1].S);// Generate a new gene ARG statseg[0][count-1]=segsites; // Total number of seg sites in sample for(i=1;i<3+param.cp.npop;i++) statseg[i][count-1]=0; if((segsites>0)) // Case segsite>0: get stats { /* fprintf(pf, "segsites:%d\npositions:\n",segsites); */ /* for(i=0;i<param.cp.nsam;i++) fprintf(pf, "%s\n", list[i]); */ /* fprintf(pf, "\n"); */ if(segsites<=param.cp.nsites) // Case segsite < lenght of locus { for(k=0;k<param.cp.nsam;k++) { for(i=k+1;i<param.cp.nsam;i++) { if((k<param.lp[count-1].ni[1])&&(i<param.lp[count-1].ni[1])) // pop1 { param.lp[count-1].tpH[0]++; // # chromosomes for(j=0;j<segsites;j++) { if(list[k][j]!=list[i][j]) param.lp[count-1].tpH[1]++; // # seg sites } } else if((k>=param.lp[count-1].ni[1])&&(i>=param.lp[count-1].ni[1])) // pop2 { param.lp[count-1].tpH[2]++; // # chromosomes for(j=0;j<segsites;j++) { if(list[k][j]!=list[i][j]) param.lp[count-1].tpH[3]++; // # seg sites } } else // total sample { param.lp[count-1].tpH[4]++;; // Totsal sample size for(j=0;j<segsites;j++) { if(list[k][j]!=list[i][j]) param.lp[count-1].tpH[5]++; // total S } } }// Loop on chromosome 
}// Loop along all sampled sequence for the locus for(i=0;i<segsites;i++) // Calulate S statistics for the locus { if(typeseg[i]<0) statseg[3][count-1]++; // shared else if(typeseg[i]<param.cp.npop+1) statseg[typeseg[i]][count-1]++; // population specific else statseg[4][count-1]++; // fixed } for(i=1;i<5;i++) //--- Record S1 S2 Ss Sf forthe locus ---// param.cp.lSiFst[i-1]+=statseg[i][count-1]; for(i=0;i<param.cp.nsam;i++) // Free memory for this locus free(list[i]); free(list); }// End case segsite<lenght of locus else // Case segsites>lenght of locus { okim=1; // Sample have a wrong locus oksim=1; // stop this simulation break; } }// End locus polymorphic else // Locus without seg sites { okim=1; // Sample have a wrong locus (S=0) oksim=2; // 0 for all stats } }// End Sample good until now if(okim==0) // All loci good until now { nokl++; // +1 good locus nokl0++; // +1 polymorphic locus for(i=0;i<7;i++) { if(i<4) param.cp.sSiFst[i]+=param.cp.lSiFst[i]; // sum of Sk param.lp[count-1].H[i]=param.lp[count-1].tpH[i]; // locus specific stats } param.cp.lSiFst[5]=param.lp[count-1].H[1]/=param.lp[count-1].H[0]; // Hw1 param.cp.lSiFst[6]=param.lp[count-1].H[3]/=param.lp[count-1].H[2]; // Hw2 param.lp[count-1].H[5]/=param.lp[count-1].H[4]; // Hb param.cp.lSiFst[4]=param.lp[count-1].H[6]=1-((param.lp[count-1].H[1]+param.lp[count-1].H[3])/2)/ param.lp[count-1].H[5];// locis specific Fst if((param.lp[count-1].S[0]>0)&&(param.lp[count-1].ni[1]>2)) // Locus popymorphic in pop1 { param.cp.lSiFst[7]=tajd(param.lp[count-1].ni[1], param.lp[count-1].S[0], param.lp[count-1].H[1]); param.cp.sSiFst[7]+=param.cp.lSiFst[7]; oks[0]++; // +1 good stat for pop1 } if((param.lp[count-1].S[1]>0)&&(param.lp[count-1].ni[2]>2)) // Locus popymorphic in pop2 { param.cp.lSiFst[8]=tajd(param.lp[count-1].ni[2], param.lp[count-1].S[1], param.lp[count-1].H[3]); param.cp.sSiFst[8]+=param.cp.lSiFst[8]; oks[1]++; // +1 good stat for pop2 } if(statseg[1][count-1]>0) // Locus popymorphic private in pop1 
param.lp[count-1].H[7]=param.cp.lSiFst[9]+=(double) param.lp[count-1].S[2]/(statseg[1][count-1]*param.lp[count-1].ni[1]*2); // p(1) if(statseg[2][count-1]>0) // Locus popymorphic private in pop2 param.lp[count-1].H[7]=param.cp.lSiFst[9]+=(double) param.lp[count-1].S[3]/(statseg[2][count-1]*param.lp[count-1].ni[2]*2); // p(1) param.cp.sSiFst[9]+=param.cp.lSiFst[9]; // sum p1 if(statseg[3][count-1]>0) // Locus popymorphic private in pop2 { param.lp[count-1].H[8]=param.cp.lSiFst[10]=(double) param.lp[count-1].S[4]/(statseg[3][count-1]*(param.lp[count-1].ni[2]+param.lp[count-1].ni[1])); // p(2) param.cp.sSiFst[10]+=param.cp.lSiFst[10]; // sum p2 oks[2]++; } param.cp.sSiFst[4]+=param.lp[count-1].H[6]; // sum Fst param.cp.sSiFst[5]+=param.lp[count-1].H[1]; // sum Hw1 param.cp.sSiFst[6]+=param.lp[count-1].H[3]; // sum Hw2 } else if(oksim==2) // Case no seg site for that locus { oksim=0; // reset checks okim=0; nokl++; // 1+ locus to count in mean (all 0 values) for(i=0;i<9;i++) param.lp[count-1].H[i]=param.lp[count-1].tpH[i]; // locus specific stats } }// End loop on loci if(nokl==howmany) // All sample good { totsim++; // 1+ good simulation for(i=0;i<4;i++) param.cp.SiFst[i]+=param.cp.sSiFst[i]; // sum of S stats along simulations for(i=4;i<11;i++) { if(((i<7)||(i>=9))&&(nokl0>0)) param.cp.SiFst[i]+=(double)param.cp.sSiFst[i]/nokl0; // mean of other stats along simulations if((i>=7)&&(i<9)&&(oks[i-7]>0)) { param.cp.SiFst[i]+=(double)param.cp.sSiFst[i]/oks[i-7]; okstot[i-7]++; } } } }// End loop on simulations if(oksim==0) // All simulations worked { /* Uncommented for Celine's use */ /* for(i=0;i<11;i++) *///// for(i=0;i<9;i++) { if((i<7)||(i>=9)) fprintf(pf, "%lg\t", (double) param.cp.SiFst[i]/(totsim)); // write mean of sum of S stats, Fst and Hws over simulations if((i>=7)&&(i<9)) { if(oks[i-7]>0) fprintf(pf, "%lg\t", (double) param.cp.SiFst[i]/(okstot[i-7])); // write mean Tds if S>0 in pops else fprintf(pf, "NA\t" ); } } fprintf(pf, "\n"); } else // Case one locus 
with too much seg sites { for(i=0;i<9;i++) fprintf(pf, "NA\t" ); fprintf(pf, "\n"); } /* Celine changed 03/18/2010 */ seedit("end", pf, ¶m); // in randx.c, flag[0]!="s" so create/rewrite seed in seedmimar /*/////*/ fclose(pf); free(typeseg); for(i=0;i<param.cp.npop+3;i++) { if(i<param.cp.npop+1) free(nbvariant[i]); free(statseg[i]); } free(nbvariant); free(statseg); ///////// FREE PARAM /////// for(i=0;i<param.cp.npop;i++) free(param.cp.mig_mat[i]); free(param.cp.mig_mat); free(param.cp.config); /* Celine changed 11/27/2009 */ for(i=9;i>=0;i--) if(param.cp.listevent[i]!=NULL && param.cp.listevent[i]->nextde!=NULL) free(param.cp.listevent[i]->nextde); if(param.cp.listevent!=NULL) free(param.cp.listevent); /*/////*/ free(param.cp.deventlist); free(param.cp.size); free(param.cp.alphag); for(i=0;i<3;i++) free(param.cp.uniform[i]); free(param.cp.uniform); free(param.cp.oldest); free(param.cp.newest); free(param.cp.newparam); /* Celine changed 11/27/2009 */ for(i=0;i<howmany;i++) free(param.lp[i].name); /*/////*/ free(param.lp); /* Celine changed 03/18/2010 */ free( param.tableseeds); /*/////*/ exit(0); }// End main function