void sim_geno(int n_ind, int n_pos, int n_gen, int n_draws, int *geno, double *rf, double *rf2, double error_prob, int *draws, double initf(int, int *), double emitf(int, int, double, int *), double stepf(int, int, double, double, int *)) { int i, k, j, v, v2; double s, **beta, *probs; int **Geno, ***Draws, curstate; int cross_scheme[2]; /* cross scheme hidden in draws argument; used by hmm_bcsft */ cross_scheme[0] = draws[0]; cross_scheme[1] = draws[1]; draws[0] = 0; draws[1] = 0; /* allocate space for beta and reorganize geno and draws */ /* Geno indexed as Geno[pos][ind] */ /* Draws indexed as Draws[rep][pos][ind] */ reorg_geno(n_ind, n_pos, geno, &Geno); reorg_draws(n_ind, n_pos, n_draws, draws, &Draws); allocate_alpha(n_pos, n_gen, &beta); allocate_double(n_gen, &probs); /* Read R's random seed */ GetRNGstate(); for(i=0; i<n_ind; i++) { /* i = individual */ R_CheckUserInterrupt(); /* check for ^C */ /* do backward equations */ /* initialize beta */ for(v=0; v<n_gen; v++) beta[v][n_pos-1] = 0.0; /* backward equations */ for(j=n_pos-2; j>=0; j--) { for(v=0; v<n_gen; v++) { beta[v][j] = beta[0][j+1] + stepf(v+1,1,rf[j], rf2[j], cross_scheme) + emitf(Geno[j+1][i],1,error_prob, cross_scheme); for(v2=1; v2<n_gen; v2++) beta[v][j] = addlog(beta[v][j], beta[v2][j+1] + stepf(v+1,v2+1,rf[j],rf2[j], cross_scheme) + emitf(Geno[j+1][i],v2+1,error_prob, cross_scheme)); } } for(k=0; k<n_draws; k++) { /* k = simulation replicate */ /* first draw */ /* calculate probs */ s = (probs[0] = initf(1, cross_scheme)+emitf(Geno[0][i],1,error_prob, cross_scheme)+beta[0][0]); for(v=1; v<n_gen; v++) { probs[v] = initf(v+1, cross_scheme) + emitf(Geno[0][i], v+1, error_prob, cross_scheme) + beta[v][0]; s = addlog(s, probs[v]); } for(v=0; v<n_gen; v++) probs[v] = exp(probs[v] - s); /* make draw: returns a value from {1, 2, ..., n_gen} */ curstate = Draws[k][0][i] = sample_int(n_gen, probs); /* move along chromosome */ for(j=1; j<n_pos; j++) { /* calculate probs */ for(v=0; v<n_gen; v++) probs[v] = exp(stepf(curstate,v+1,rf[j-1],rf2[j-1], cross_scheme) + emitf(Geno[j][i],v+1,error_prob, cross_scheme) + beta[v][j] - beta[curstate-1][j-1]); /* make draw */ curstate = Draws[k][j][i] = sample_int(n_gen, probs); } } /* loop over replicates */ } /* loop over individuals */ /* write R's random seed */ PutRNGstate(); }
/********************************************************************** * * sim_ril * * n_chr Number of chromosomes * n_mar Number of markers on each chromosome (vector of length n_chr) * n_ril Number of RILs to simulate * * map Vector of marker locations, of length sum(n_mar) * First marker on each chromosome should be at 0. * * n_str Number of parental strains (either 2, 4, or 8) * * m Interference parameter (0 is no interference) * p For Stahl model, proportion of chiasmata from the NI model * * include_x Whether the last chromosome is the X chromosome * * random_cross Indicates whether the order of the strains in the cross * should be randomized. * * selfing If 1, use selfing; if 0, use sib mating * * cross On output, the cross used for each line * (vector of length n_ril x n_str) * * ril On output, the simulated data * (vector of length sum(n_mar) x n_ril) * * origgeno Like ril, but with no missing data * * error_prob Genotyping error probability (used nly with n_str==2) * * missing_prob Rate of missing genotypes * * errors Error indicators (n_mar x n_ril) * **********************************************************************/ void sim_ril(int n_chr, int *n_mar, int n_ril, double *map, int n_str, int m, double p, int include_x, int random_cross, int selfing, int *cross, int *ril, int *origgeno, double error_prob, double missing_prob, int *errors) { int i, j, k, ngen, tot_mar, curseg; struct individual par1, par2, kid1, kid2; int **Ril, **Cross, **Errors, **OrigGeno; int maxwork, isX, flag, max_xo, *firstmarker; double **Map, maxlen, chrlen, *work; /* count total number of markers */ for(i=0, tot_mar=0; i<n_chr; i++) tot_mar += n_mar[i]; reorg_geno(tot_mar, n_ril, ril, &Ril); reorg_geno(n_str, n_ril, cross, &Cross); reorg_geno(tot_mar, n_ril, errors, &Errors); reorg_geno(tot_mar, n_ril, origgeno, &OrigGeno); /* allocate space */ Map = (double **)R_alloc(n_chr, sizeof(double *)); Map[0] = map; for(i=1; i<n_chr; i++) Map[i] = Map[i-1] + n_mar[i-1]; /* location of first marker */ firstmarker = (int *)R_alloc(n_chr, sizeof(int)); firstmarker[0] = 0; for(i=1; i<n_chr; i++) firstmarker[i] = firstmarker[i-1] + n_mar[i-1]; /* maximum chromosome length (in cM) */ maxlen = Map[0][n_mar[0]-1]; for(i=1; i<n_chr; i++) if(maxlen < Map[i][n_mar[i]-1]) maxlen = Map[i][n_mar[i]-1]; /* allocate space for individuals */ max_xo = (int)qpois(1e-10, maxlen/100.0, 0, 0)*6; if(!selfing) max_xo *= 5; allocate_individual(&par1, max_xo); allocate_individual(&par2, max_xo); allocate_individual(&kid1, max_xo); allocate_individual(&kid2, max_xo); maxwork = (int)qpois(1e-10, (m+1)*maxlen/50.0, 0, 0)*3; work = (double *)R_alloc(maxwork, sizeof(double)); for(i=0; i<n_ril; i++) { /* set up cross */ for(j=0; j<n_str; j++) Cross[i][j] = j+1; if(random_cross) int_permute(Cross[i], n_str); for(j=0; j<n_chr; j++) { isX = include_x && j==n_chr-1; chrlen = Map[j][n_mar[j]-1]; par1.n_xo[0] = par1.n_xo[1] = par2.n_xo[0] = par2.n_xo[1] = 0; /* initial generations */ if(n_str==2) { par1.allele[0][0] = par2.allele[0][0] = 1; par1.allele[1][0] = par2.allele[1][0] = 2; } else if(n_str==4) { par1.allele[0][0] = 1; par1.allele[1][0] = 2; par2.allele[0][0] = 3; par2.allele[1][0] = 4; } else { /* 8 strain case */ par1.allele[0][0] = 1; par1.allele[1][0] = 2; par2.allele[0][0] = 3; par2.allele[1][0] = 4; docross(par1, par2, &kid1, chrlen, m, p, 0, &maxwork, &work); par1.allele[0][0] = 5; par1.allele[1][0] = 6; par2.allele[0][0] = 7; par2.allele[1][0] = 8; docross(par1, par2, &kid2, chrlen, m, p, isX, &maxwork, &work); copy_individual(&kid1, &par1); copy_individual(&kid2, &par2); } /* start inbreeding */ ngen=1; while(1) { R_CheckUserInterrupt(); /* check for ^C */ docross(par1, par2, &kid1, chrlen, m, p, 0, &maxwork, &work); if(!selfing) docross(par1, par2, &kid2, chrlen, m, p, isX, &maxwork, &work); /* are we done? */ flag = 0; if(selfing) { if(kid1.n_xo[0] == kid1.n_xo[1]) { for(k=0; k<kid1.n_xo[0]; k++) { if(kid1.allele[0][k] != kid1.allele[1][k] || fabs(kid1.xoloc[0][k] - kid1.xoloc[1][k]) > 1e-6) { flag = 1; break; } } if(kid1.allele[0][kid1.n_xo[0]] != kid1.allele[1][kid1.n_xo[0]]) flag = 1; } else flag = 1; } else { if(kid1.n_xo[0] == kid1.n_xo[1] && kid1.n_xo[0] == kid2.n_xo[0] && kid1.n_xo[0] == kid2.n_xo[1]) { for(k=0; k<kid1.n_xo[0]; k++) { if(kid1.allele[0][k] != kid1.allele[1][k] || kid1.allele[0][k] != kid2.allele[0][k] || kid1.allele[0][k] != kid2.allele[1][k] || fabs(kid1.xoloc[0][k] - kid1.xoloc[1][k]) > 1e-6 || fabs(kid1.xoloc[0][k] - kid2.xoloc[0][k]) > 1e-6 || fabs(kid1.xoloc[0][k] - kid2.xoloc[1][k]) > 1e-6) { flag = 1; break; } } if(kid1.allele[0][kid1.n_xo[0]] != kid1.allele[1][kid1.n_xo[0]] || kid1.allele[0][kid1.n_xo[0]] != kid2.allele[0][kid1.n_xo[0]] || kid1.allele[0][kid1.n_xo[0]] != kid2.allele[1][kid1.n_xo[0]]) flag = 1; } else flag = 1; } if(!flag) break; /* done inbreeding */ /* go to next generation */ copy_individual(&kid1, &par1); if(selfing) copy_individual(&kid1, &par2); else copy_individual(&kid2, &par2); } /* end with inbreeding of this chromosome */ /* fill in alleles */ curseg = 0; for(k=0; k<n_mar[j]; k++) { /* loop over markers */ while(curseg < kid1.n_xo[0] && Map[j][k] > kid1.xoloc[0][curseg]) curseg++; OrigGeno[i][k+firstmarker[j]] = Ril[i][k+firstmarker[j]] = Cross[i][kid1.allele[0][curseg]-1]; /* simulate missing ? */ if(unif_rand() < missing_prob) { Ril[i][k+firstmarker[j]] = 0; } else if(n_str == 2 && unif_rand() < error_prob) { /* simulate error */ Ril[i][k+firstmarker[j]] = 3 - Ril[i][k+firstmarker[j]]; Errors[i][k+firstmarker[j]] = 1; } } } /* loop over chromosomes */ } /* loop over lines */ }
void calc_genoprob_special(int n_ind, int n_pos, int n_gen, int *geno, double *rf, double *rf2, double error_prob, double *genoprob, double initf(int, int *), double emitf(int, int, double, int *), double stepf(int, int, double, double, int *)) { int i, j, j2, v, v2, curpos; double s, **alpha, **beta; int **Geno; double ***Genoprob; int cross_scheme[2]; /* cross scheme hidden in genoprob argument; used by hmm_bcsft */ cross_scheme[0] = genoprob[0]; cross_scheme[1] = genoprob[1]; genoprob[0] = 0.0; genoprob[1] = 0.0; /* allocate space for alpha and beta and reorganize geno and genoprob */ reorg_geno(n_ind, n_pos, geno, &Geno); reorg_genoprob(n_ind, n_pos, n_gen, genoprob, &Genoprob); allocate_alpha(n_pos, n_gen, &alpha); allocate_alpha(n_pos, n_gen, &beta); for(i=0; i<n_ind; i++) { /* i = individual */ for(curpos=0; curpos < n_pos; curpos++) { if(!Geno[curpos][i]) continue; R_CheckUserInterrupt(); /* check for ^C */ /* initialize alpha and beta */ for(v=0; v<n_gen; v++) { if(curpos==0) alpha[v][0] = initf(v+1, cross_scheme) + emitf(Geno[0][i], v+1, error_prob, cross_scheme); else alpha[v][0] = initf(v+1, cross_scheme) + emitf(Geno[0][i], v+1, TOL, cross_scheme); beta[v][n_pos-1] = 0.0; } /* forward-backward equations */ for(j=1,j2=n_pos-2; j<n_pos; j++, j2--) { for(v=0; v<n_gen; v++) { alpha[v][j] = alpha[0][j-1] + stepf(1, v+1, rf[j-1], rf2[j-1], cross_scheme); if(curpos==j2+1) beta[v][j2] = beta[0][j2+1] + stepf(v+1,1,rf[j2], rf2[j2], cross_scheme) + emitf(Geno[j2+1][i],1,error_prob, cross_scheme); else beta[v][j2] = beta[0][j2+1] + stepf(v+1,1,rf[j2], rf2[j2], cross_scheme) + emitf(Geno[j2+1][i],1,TOL, cross_scheme); for(v2=1; v2<n_gen; v2++) { alpha[v][j] = addlog(alpha[v][j], alpha[v2][j-1] + stepf(v2+1,v+1,rf[j-1],rf2[j-1], cross_scheme)); if(curpos==j2+1) beta[v][j2] = addlog(beta[v][j2], beta[v2][j2+1] + stepf(v+1,v2+1,rf[j2],rf2[j2], cross_scheme) + emitf(Geno[j2+1][i],v2+1,error_prob, cross_scheme)); else beta[v][j2] = addlog(beta[v][j2], beta[v2][j2+1] + stepf(v+1,v2+1,rf[j2],rf2[j2], cross_scheme) + emitf(Geno[j2+1][i],v2+1,TOL, cross_scheme)); } if(curpos==j) alpha[v][j] += emitf(Geno[j][i],v+1,error_prob, cross_scheme); else alpha[v][j] += emitf(Geno[j][i],v+1,TOL, cross_scheme); } } /* calculate genotype probabilities */ s = Genoprob[0][curpos][i] = alpha[0][curpos] + beta[0][curpos]; for(v=1; v<n_gen; v++) { Genoprob[v][curpos][i] = alpha[v][curpos] + beta[v][curpos]; s = addlog(s, Genoprob[v][curpos][i]); } for(v=0; v<n_gen; v++) Genoprob[v][curpos][i] = exp(Genoprob[v][curpos][i] - s); } /* end loop over current position */ } /* loop over individuals */ }
void R_mqmaugment(int *geno, double *dist, double *pheno, int *auggeno, double *augPheno, int *augIND, int *Nind, int *Naug, int *Nmark, int *Npheno, int *maxind, int *maxiaug, double *minprob, int *chromo, int *rqtlcrosstypep, int *augment_strategy, int *verbosep) { int **Geno; double **Pheno; double **Dist; int **NEW; //Holds the output for the augmentdata function int **Chromo; double **NEWPheno; //New phenotype vector int **NEWIND; //New list of individuals const int nind0 = *Nind; //Individuals we start with const int verbose = *verbosep; const RqtlCrossType rqtlcrosstype = (RqtlCrossType) *rqtlcrosstypep; if(verbose) Rprintf("INFO: Starting C-part of the data augmentation routine\n"); ivector new_ind; MQMMarkerMatrix markers = newMQMMarkerMatrix(*Nmark, nind0); vector mapdistance = newvector(*Nmark); ivector chr = newivector(*Nmark); //Reorganise the pointers into arrays, Singletons are just cast into the function reorg_geno(nind0, *Nmark, geno, &Geno); reorg_int(*Nmark, 1, chromo, &Chromo); reorg_pheno(nind0, *Npheno, pheno, &Pheno); reorg_pheno(*Nmark, 1, dist, &Dist); reorg_int(*maxind, *Nmark, auggeno, &NEW); reorg_int((*maxiaug)*nind0, 1, augIND, &NEWIND); reorg_pheno((*maxiaug)*nind0, 1, augPheno, &NEWPheno); MQMCrossType crosstype = determine_MQMCross(*Nmark, *Nind, (const int **)Geno, rqtlcrosstype); // Determine cross change_coding(Nmark, Nind, Geno, markers, crosstype); // Change all the markers from R/qtl format to MQM internal if(verbose) Rprintf("INFO: Filling the chromosome matrix\n"); for (int i=0; i<(*Nmark); i++) { //Set some general information structures per marker mapdistance[i] = POSITIONUNKNOWN; mapdistance[i] = Dist[0][i]; chr[i] = Chromo[0][i]; } if(mqmaugmentfull(&markers,Nind,Naug,&new_ind,*minprob, *maxind, *maxiaug,&Pheno,*Nmark,chr,mapdistance,*augment_strategy,crosstype,verbose)){ //Data augmentation finished succesfully, encode it back into RQTL format for (int i = 0; i<(*Nmark); i++) { for (int j = 0; j<(*Naug); j++) { //Rprintf("INFO: Phenotype after return: %f",NEWPheno[0][j]); NEWPheno[0][j] = Pheno[0][j]; NEWIND[0][j] = new_ind[j]; NEW[i][j] = 9; if (markers[i][j] == MAA) { NEW[i][j] = 1; } if (markers[i][j] == MH) { NEW[i][j] = 2; } if (markers[i][j] == MBB) { // [karl:] this might need to be changed for RIL crosstype==CRIL ? NEW[i][j]=2 : NEW[i][j] = 3; //[Danny:] This should solve it } if (markers[i][j] == MNOTAA) { NEW[i][j] = 5; } if (markers[i][j] == MNOTBB) { NEW[i][j] = 4; } } } if (verbose) { Rprintf("# Unique individuals before augmentation:%d\n", nind0); Rprintf("# Unique selected individuals:%d\n", *Nind); Rprintf("# Marker p individual:%d\n", *Nmark); Rprintf("# Individuals after augmentation:%d\n", *Naug); Rprintf("INFO: Data augmentation succesfull\n"); } } else { //Unsuccessfull data augmentation exit Rprintf("INFO: This code should not be reached, data corruption could have occured. Please re-run this analysis.\n"); *Naug = nind0; for (int i=0; i<(*Nmark); i++) { for (int j=0; j<(*Naug); j++) { NEWPheno[0][j] = Pheno[0][j]; NEW[i][j] = 9; if (markers[i][j] == MAA) { NEW[i][j] = 1; } if (markers[i][j] == MH) { NEW[i][j] = 2; } if (markers[i][j] == MBB) { // [Karl:] this might need to be changed for RIL crosstype==CRIL ? NEW[i][j]=2 : NEW[i][j] = 3; // [Danny:] This should solve it } if (markers[i][j] == MNOTAA) { NEW[i][j] = 5; } if (markers[i][j] == MNOTBB) { NEW[i][j] = 4; } } } fatal("Data augmentation failed", ""); } delMQMMarkerMatrix(markers,*Nmark); // [Danny:] This looked suspicious, we were leaking memory here because we didn't clean it Free(mapdistance); Free(chr); return; }
void est_map_bci(int n_ind, int n_mar, int *geno, double *d, int m, double p, double error_prob, double *loglik, int maxit, double tol, int verbose) { int i, j, j2, v, v2, it, flag=0, **Geno, n_states; double s, **alpha, **beta, **gamma, *cur_d, *rf; double ***tm, *temp; double curloglik; double initprob; int ndigits; double maxdif, tempdif; char pattern[100], text[200]; n_states = 2*(m+1); initprob = -log((double)n_states); /* allocate space for beta and reorganize geno */ reorg_geno(n_ind, n_mar, geno, &Geno); allocate_alpha(n_mar, n_states, &alpha); allocate_alpha(n_mar, n_states, &beta); allocate_dmatrix(n_states, n_states, &gamma); allocate_double(n_mar-1, &cur_d); allocate_double(n_mar-1, &rf); /* allocate space for the transition matrices */ /* size n_states x n_states x (n_mar-1) */ /* tm[state1][state2][interval] */ tm = (double ***)R_alloc(n_states, sizeof(double **)); tm[0] = (double **)R_alloc(n_states * n_states, sizeof(double *)); for(i=1; i<n_states; i++) tm[i] = tm[i-1] + n_states; tm[0][0] = (double *)R_alloc(n_states * n_states * (n_mar - 1), sizeof(double)); temp = tm[0][0]; for(i=0; i < n_states; i++) { for(j=0; j < n_states; j++) { tm[i][j] = temp; temp += n_mar-1; } } /* digits in verbose output */ if(verbose) { ndigits = (int)ceil(-log10(tol)); if(ndigits > 16) ndigits=16; sprintf(pattern, "%s%d.%df", "%", ndigits+3, ndigits+1); } for(j=0; j<n_mar-1; j++) d[j] /= 100.0; /* convert to Morgans */ /* begin EM algorithm */ for(it=0; it<maxit; it++) { for(j=0; j<n_mar-1; j++) { cur_d[j] = d[j]; rf[j] = 0.0; } /* calculate the transition matrices */ step_bci(n_mar, n_states, tm, cur_d, m, p, maxit, tol); for(i=0; i<n_ind; i++) { /* i = individual */ R_CheckUserInterrupt(); /* check for ^C */ /* initialize alpha and beta */ for(v=0; v<n_states; v++) { alpha[v][0] = initprob + emit_bci(Geno[0][i], v, error_prob, m); beta[v][n_mar-1] = 0.0; } /* forward-backward equations */ for(j=1,j2=n_mar-2; j<n_mar; j++, j2--) { for(v=0; v<n_states; v++) { alpha[v][j] = alpha[0][j-1] + tm[0][v][j-1]; beta[v][j2] = beta[0][j2+1] + tm[v][0][j2] + emit_bci(Geno[j2+1][i], 0, error_prob, m); for(v2=1; v2<n_states; v2++) { alpha[v][j] = addlog(alpha[v][j], alpha[v2][j-1] + tm[v2][v][j-1]); beta[v][j2] = addlog(beta[v][j2], beta[v2][j2+1] + tm[v][v2][j2] + emit_bci(Geno[j2+1][i], v2, error_prob, m)); } alpha[v][j] += emit_bci(Geno[j][i], v, error_prob, m); } } for(j=0; j<n_mar-1; j++) { /* calculate gamma = log Pr(v1, v2, O) */ for(v=0, s=0.0; v<n_states; v++) { for(v2=0; v2<n_states; v2++) { gamma[v][v2] = alpha[v][j] + beta[v2][j+1] + emit_bci(Geno[j+1][i], v2, error_prob, m) + tm[v][v2][j]; if(v==0 && v2==0) s = gamma[v][v2]; else s = addlog(s, gamma[v][v2]); } } for(v=0; v<n_states; v++) { for(v2=0; v2<n_states; v2++) { rf[j] += nrec_bci(v, v2, m) * exp(gamma[v][v2] - s); } } } } /* loop over individuals */ /* rescale */ for(j=0; j<n_mar-1; j++) { rf[j] /= (double)n_ind; if(rf[j] < tol/100.0) rf[j] = tol/100.0; else if(rf[j] > 0.5-tol/100.0) rf[j] = 0.5-tol/100.0; } /* use map function to convert back to distances */ for(j=0; j<n_mar-1; j++) d[j] = imf_stahl(rf[j], m, p, 1e-10, 1000); if(verbose>1) { /* print estimates as we go along*/ Rprintf(" %4d ", it+1); maxdif=0.0; for(j=0; j<n_mar-1; j++) { tempdif = fabs(d[j] - cur_d[j])/(cur_d[j]+tol*100.0); if(maxdif < tempdif) maxdif = tempdif; } sprintf(text, "%s%s\n", " max rel've change = ", pattern); Rprintf(text, maxdif); } /* check convergence */ for(j=0, flag=0; j<n_mar-1; j++) { if(fabs(d[j] - cur_d[j]) > tol*(cur_d[j]+tol*100.0)) { flag = 1; break; } } if(!flag) break; } /* end EM algorithm */ if(flag) warning("Didn't converge!\n"); /* re-calculate transition matrices */ step_bci(n_mar, n_states, tm, d, m, p, maxit, tol); /* calculate log likelihood */ *loglik = 0.0; for(i=0; i<n_ind; i++) { /* i = individual */ R_CheckUserInterrupt(); /* check for ^C */ /* initialize alpha */ for(v=0; v<n_states; v++) alpha[v][0] = initprob + emit_bci(Geno[0][i], v, error_prob, m); /* forward equations */ for(j=1; j<n_mar; j++) { for(v=0; v<n_states; v++) { alpha[v][j] = alpha[0][j-1] + tm[0][v][j-1]; for(v2=1; v2<n_states; v2++) alpha[v][j] = addlog(alpha[v][j], alpha[v2][j-1] + tm[v2][v][j-1]); alpha[v][j] += emit_bci(Geno[j][i], v, error_prob, m); } } curloglik = alpha[0][n_mar-1]; for(v=1; v<n_states; v++) curloglik = addlog(curloglik, alpha[v][n_mar-1]); *loglik += curloglik; } if(verbose) { if(verbose < 2) { /* print final estimates */ Rprintf(" no. iterations = %d\n", it+1); maxdif=0.0; for(j=0; j<n_mar-1; j++) { tempdif = fabs(d[j] - cur_d[j])/(cur_d[j]+tol*100.0); if(maxdif < tempdif) maxdif = tempdif; } sprintf(text, "%s%s\n", " max rel've change at last step = ", pattern); Rprintf(text, maxdif); } Rprintf(" loglik: %10.4lf\n\n", *loglik); } /* convert distances back to cM */ for(j=0; j<n_mar-1; j++) d[j] *= 100.0; }
void calc_pairprob(int n_ind, int n_pos, int n_gen, int *geno, double *rf, double *rf2, double error_prob, double *genoprob, double *pairprob, double initf(int), double emitf(int, int, double), double stepf(int, int, double, double)) { int i, j, j2, v, v2, v3; double s=0.0, **alpha, **beta; int **Geno; double ***Genoprob, *****Pairprob; /* n_pos must be at least 2, or there are no pairs! */ if(n_pos < 2) error("n_pos must be > 1 in calc_pairprob"); /* allocate space for alpha and beta and reorganize geno, genoprob, and pairprob */ reorg_geno(n_ind, n_pos, geno, &Geno); reorg_genoprob(n_ind, n_pos, n_gen, genoprob, &Genoprob); reorg_pairprob(n_ind, n_pos, n_gen, pairprob, &Pairprob); allocate_alpha(n_pos, n_gen, &alpha); allocate_alpha(n_pos, n_gen, &beta); for(i=0; i<n_ind; i++) { /* i = individual */ R_CheckUserInterrupt(); /* check for ^C */ /* initialize alpha and beta */ for(v=0; v<n_gen; v++) { alpha[v][0] = initf(v+1) + emitf(Geno[0][i], v+1, error_prob); beta[v][n_pos-1] = 0.0; } /* forward-backward equations */ for(j=1,j2=n_pos-2; j<n_pos; j++, j2--) { for(v=0; v<n_gen; v++) { alpha[v][j] = alpha[0][j-1] + stepf(1, v+1, rf[j-1], rf2[j-1]); beta[v][j2] = beta[0][j2+1] + stepf(v+1,1,rf[j2], rf2[j2]) + emitf(Geno[j2+1][i],1,error_prob); for(v2=1; v2<n_gen; v2++) { alpha[v][j] = addlog(alpha[v][j], alpha[v2][j-1] + stepf(v2+1,v+1,rf[j-1],rf2[j-1])); beta[v][j2] = addlog(beta[v][j2], beta[v2][j2+1] + stepf(v+1,v2+1,rf[j2],rf2[j2]) + emitf(Geno[j2+1][i],v2+1,error_prob)); } alpha[v][j] += emitf(Geno[j][i],v+1,error_prob); } } /* calculate genotype probabilities */ for(j=0; j<n_pos; j++) { s = Genoprob[0][j][i] = alpha[0][j] + beta[0][j]; for(v=1; v<n_gen; v++) { Genoprob[v][j][i] = alpha[v][j] + beta[v][j]; s = addlog(s, Genoprob[v][j][i]); } for(v=0; v<n_gen; v++) Genoprob[v][j][i] = exp(Genoprob[v][j][i] - s); } /* calculate Pr(G[j], G[j+1] | marker data) for i = 1...n_pos-1 */ for(j=0; j<n_pos-1; j++) { for(v=0; v<n_gen; v++) { for(v2=0; v2<n_gen; v2++) { Pairprob[v][v2][j][j+1][i] = alpha[v][j] + beta[v2][j+1] + stepf(v+1,v2+1,rf[j],rf2[j]) + emitf(Geno[j+1][i],v2+1,error_prob); if(v==0 && v2==0) s=Pairprob[v][v2][j][j+1][i]; else s = addlog(s,Pairprob[v][v2][j][j+1][i]); } } /* scale to sum to 1 */ for(v=0; v<n_gen; v++) for(v2=0; v2<n_gen; v2++) Pairprob[v][v2][j][j+1][i] = exp(Pairprob[v][v2][j][j+1][i] - s); } /* now calculate Pr(G[i], G[j] | marker data) for j > i+1 */ for(j=0; j<n_pos-2; j++) { for(j2=j+2; j2<n_pos; j2++) { for(v=0; v<n_gen; v++) { /* genotype at pos'n j */ for(v2=0; v2<n_gen; v2++) { /* genotype at pos'n j2 */ Pairprob[v][v2][j][j2][i] = 0.0; for(v3=0; v3<n_gen; v3++) { /* genotype at pos'n j2-1 */ s = Genoprob[v3][j2-1][i]; if(fabs(s) > TOL) /* avoid 0/0 */ Pairprob[v][v2][j][j2][i] += Pairprob[v][v3][j][j2-1][i]* Pairprob[v3][v2][j2-1][j2][i]/s; } } } /* end loops over genotypes */ } } /* end loops over pairs of positions */ } /* end loop over individuals */ }
void est_rf(int n_ind, int n_mar, int *geno, double *rf, double erec(int, int, double), double logprec(int, int, double), int maxit, double tol, int meioses_per) { int i, j1, j2, s, **Geno, n_mei=0, flag=0; double **Rf, next_rf=0.0, cur_rf=0.0; /* reorganize geno and rf */ reorg_geno(n_ind, n_mar, geno, &Geno); reorg_errlod(n_mar, n_mar, rf, &Rf); for(j1=0; j1<n_mar; j1++) { /* count number of meioses */ for(i=0, n_mei=0; i<n_ind; i++) if(Geno[j1][i] != 0) n_mei += meioses_per; Rf[j1][j1] = (double) n_mei; R_CheckUserInterrupt(); /* check for ^C */ for(j2=j1+1; j2<n_mar; j2++) { /* count meioses */ n_mei = flag = 0; for(i=0; i<n_ind; i++) { if(Geno[j1][i] != 0 && Geno[j2][i] != 0) { n_mei += meioses_per; /* check if informatve */ if(fabs(logprec(Geno[j1][i], Geno[j2][i], 0.5) - logprec(Geno[j1][i], Geno[j2][i], TOL)) > TOL) flag = 1; } } if(n_mei != 0 && flag == 1) { flag = 0; /* begin EM algorithm; start with cur_rf = 0.01 */ for(s=0, cur_rf=0.01; s < maxit; s++) { next_rf = 0.0; for(i=0; i<n_ind; i++) { if(Geno[j1][i] != 0 && Geno[j2][i] != 0) next_rf += erec(Geno[j1][i], Geno[j2][i], cur_rf); } next_rf /= (double) n_mei; if(fabs(next_rf - cur_rf) < tol*(cur_rf+tol*100.0)) { flag = 1; break; } cur_rf = next_rf; } if(!flag) warning("Markers (%d,%d) didn't converge\n", j1+1, j2+1); /* calculate LOD score */ Rf[j1][j2] = next_rf; Rf[j2][j1] = 0.0; for(i=0; i<n_ind; i++) { if(Geno[j1][i] != 0 && Geno[j2][i] != 0) { Rf[j2][j1] += logprec(Geno[j1][i],Geno[j2][i], next_rf); Rf[j2][j1] -= logprec(Geno[j1][i],Geno[j2][i], 0.5); } } Rf[j2][j1] /= log(10.0); } else { /* no informative meioses */ Rf[j1][j2] = NA_REAL; Rf[j2][j1] = 0.0; } } /* end loops over markers */ } }
void calc_genoprob(int n_ind, int n_pos, int n_gen, int *geno, double *rf, double *rf2, double error_prob, double *genoprob, double initf(int), double emitf(int, int, double), double stepf(int, int, double, double)) { int i, j, j2, v, v2; double s, **alpha, **beta; int **Geno; double ***Genoprob; /* allocate space for alpha and beta and reorganize geno and genoprob */ reorg_geno(n_ind, n_pos, geno, &Geno); reorg_genoprob(n_ind, n_pos, n_gen, genoprob, &Genoprob); allocate_alpha(n_pos, n_gen, &alpha); allocate_alpha(n_pos, n_gen, &beta); for(i=0; i<n_ind; i++) { /* i = individual */ R_CheckUserInterrupt(); /* check for ^C */ /* initialize alpha and beta */ for(v=0; v<n_gen; v++) { alpha[v][0] = initf(v+1) + emitf(Geno[0][i], v+1, error_prob); beta[v][n_pos-1] = 0.0; } /* forward-backward equations */ for(j=1,j2=n_pos-2; j<n_pos; j++, j2--) { for(v=0; v<n_gen; v++) { alpha[v][j] = alpha[0][j-1] + stepf(1, v+1, rf[j-1], rf2[j-1]); beta[v][j2] = beta[0][j2+1] + stepf(v+1,1,rf[j2], rf2[j2]) + emitf(Geno[j2+1][i],1,error_prob); for(v2=1; v2<n_gen; v2++) { alpha[v][j] = addlog(alpha[v][j], alpha[v2][j-1] + stepf(v2+1,v+1,rf[j-1],rf2[j-1])); beta[v][j2] = addlog(beta[v][j2], beta[v2][j2+1] + stepf(v+1,v2+1,rf[j2],rf2[j2]) + emitf(Geno[j2+1][i],v2+1,error_prob)); } alpha[v][j] += emitf(Geno[j][i],v+1,error_prob); } } /* calculate genotype probabilities */ for(j=0; j<n_pos; j++) { s = Genoprob[0][j][i] = alpha[0][j] + beta[0][j]; for(v=1; v<n_gen; v++) { Genoprob[v][j][i] = alpha[v][j] + beta[v][j]; s = addlog(s, Genoprob[v][j][i]); } for(v=0; v<n_gen; v++) Genoprob[v][j][i] = exp(Genoprob[v][j][i] - s); } /* the following is the old version */ /* for(j=0; j<n_pos; j++) { s = 0.0; for(v=0; v<n_gen; v++) s += (Genoprob[v][j][i] = exp(alpha[v][j] + beta[v][j])); for(v=0; v<n_gen; v++) Genoprob[v][j][i] /= s; } */ } /* loop over individuals */ }
void argmax_geno(int n_ind, int n_pos, int n_gen, int *geno, double *rf, double *rf2, double error_prob, int *argmax, double initf(int), double emitf(int, int, double), double stepf(int, int, double, double)) { int i, j, v, v2; double s, t, *gamma, *tempgamma, *tempgamma2; int **Geno, **Argmax, **traceback; /* Read R's random seed */ /* in the case of multiple "most likely" genotype sequences, we pick from them at random */ GetRNGstate(); /* allocate space and reorganize geno and argmax */ reorg_geno(n_ind, n_pos, geno, &Geno); reorg_geno(n_ind, n_pos, argmax, &Argmax); allocate_imatrix(n_pos, n_gen, &traceback); allocate_double(n_gen, &gamma); allocate_double(n_gen, &tempgamma); allocate_double(n_gen, &tempgamma2); for(i=0; i<n_ind; i++) { /* i = individual */ R_CheckUserInterrupt(); /* check for ^C */ /* begin viterbi algorithm */ if(n_pos > 1) { /* multiple markers */ for(v=0; v<n_gen; v++) gamma[v] = initf(v+1) + emitf(Geno[0][i], v+1, error_prob); for(j=0; j<n_pos-1; j++) { for(v=0; v<n_gen; v++) { tempgamma[v] = s = gamma[0] + stepf(1, v+1, rf[j], rf2[j]); traceback[j][v] = 0; for(v2=1; v2<n_gen; v2++) { t = gamma[v2] + stepf(v2+1, v+1, rf[j], rf2[j]); if(t > s || (fabs(t-s) < TOL && unif_rand() < 0.5)) { tempgamma[v] = s = t; traceback[j][v] = v2; } } tempgamma2[v] = tempgamma[v] + emitf(Geno[j+1][i], v+1, error_prob); } for(v=0; v<n_gen; v++) gamma[v] = tempgamma2[v]; } /* finish off viterbi and then traceback to get most likely sequence of genotypes */ Argmax[n_pos-1][i] = 0; s = gamma[0]; for(v=1; v<n_gen; v++) { if(gamma[v] > s || (fabs(gamma[v]-s) < TOL && unif_rand() < 0.5)) { s = gamma[v]; Argmax[n_pos-1][i] = v; } } for(j=n_pos-2; j >= 0; j--) Argmax[j][i] = traceback[j][Argmax[j+1][i]]; } else { /* for exactly one marker */ s = initf(1) + emitf(Geno[0][i], 1, error_prob); Argmax[0][i] = 0; for(v=1; v<n_gen; v++) { t = initf(v+1)+emitf(Geno[0][i], v+1, error_prob); if(t > s || (fabs(t-s) < TOL && unif_rand() < 0.5)) { s = t; Argmax[0][i] = v; } } } /* code genotypes as 1, 2, ... */ for(j=0; j<n_pos; j++) Argmax[j][i]++; } /* loop over individuals */ /* write R's random seed */ PutRNGstate(); }
void est_map(int n_ind, int n_mar, int n_gen, int *geno, double *rf, double *rf2, double error_prob, double initf(int), double emitf(int, int, double), double stepf(int, int, double, double), double nrecf1(int, int), double nrecf2(int, int), double *loglik, int maxit, double tol, int sexsp, int verbose) { int i, j, j2, v, v2, it, flag=0, **Geno, ndigits; double s, **alpha, **beta, **gamma, *cur_rf, *cur_rf2; double curloglik, maxdif, temp; char pattern[100], text[200]; /* allocate space for beta and reorganize geno */ reorg_geno(n_ind, n_mar, geno, &Geno); allocate_alpha(n_mar, n_gen, &alpha); allocate_alpha(n_mar, n_gen, &beta); allocate_dmatrix(n_gen, n_gen, &gamma); allocate_double(n_mar-1, &cur_rf); allocate_double(n_mar-1, &cur_rf2); /* digits in verbose output */ if(verbose) { ndigits = (int)ceil(-log10(tol)); if(ndigits > 16) ndigits=16; sprintf(pattern, "%s%d.%df", "%", ndigits+3, ndigits+1); } /* begin EM algorithm */ for(it=0; it<maxit; it++) { for(j=0; j<n_mar-1; j++) { cur_rf[j] = cur_rf2[j] = rf[j]; rf[j] = 0.0; if(sexsp) { cur_rf2[j] = rf2[j]; rf2[j] = 0.0; } } for(i=0; i<n_ind; i++) { /* i = individual */ R_CheckUserInterrupt(); /* check for ^C */ /* initialize alpha and beta */ for(v=0; v<n_gen; v++) { alpha[v][0] = initf(v+1) + emitf(Geno[0][i], v+1, error_prob); beta[v][n_mar-1] = 0.0; } /* forward-backward equations */ for(j=1,j2=n_mar-2; j<n_mar; j++, j2--) { for(v=0; v<n_gen; v++) { alpha[v][j] = alpha[0][j-1] + stepf(1, v+1, cur_rf[j-1], cur_rf2[j-1]); beta[v][j2] = beta[0][j2+1] + stepf(v+1,1,cur_rf[j2], cur_rf2[j2]) + emitf(Geno[j2+1][i],1,error_prob); for(v2=1; v2<n_gen; v2++) { alpha[v][j] = addlog(alpha[v][j], alpha[v2][j-1] + stepf(v2+1,v+1,cur_rf[j-1],cur_rf2[j-1])); beta[v][j2] = addlog(beta[v][j2], beta[v2][j2+1] + stepf(v+1,v2+1,cur_rf[j2],cur_rf2[j2]) + emitf(Geno[j2+1][i],v2+1,error_prob)); } alpha[v][j] += emitf(Geno[j][i],v+1,error_prob); } } for(j=0; j<n_mar-1; j++) { /* calculate gamma = log Pr(v1, v2, O) */ for(v=0, s=0.0; v<n_gen; v++) { for(v2=0; v2<n_gen; v2++) { gamma[v][v2] = alpha[v][j] + beta[v2][j+1] + emitf(Geno[j+1][i], v2+1, error_prob) + stepf(v+1, v2+1, cur_rf[j], cur_rf2[j]); if(v==0 && v2==0) s = gamma[v][v2]; else s = addlog(s, gamma[v][v2]); } } for(v=0; v<n_gen; v++) { for(v2=0; v2<n_gen; v2++) { rf[j] += nrecf1(v+1,v2+1) * exp(gamma[v][v2] - s); if(sexsp) rf2[j] += nrecf2(v+1,v2+1) * exp(gamma[v][v2] - s); } } } } /* loop over individuals */ /* rescale */ for(j=0; j<n_mar-1; j++) { rf[j] /= (double)n_ind; if(rf[j] < tol/1000.0) rf[j] = tol/1000.0; else if(rf[j] > 0.5-tol/1000.0) rf[j] = 0.5-tol/1000.0; if(sexsp) { rf2[j] /= (double)n_ind; if(rf2[j] < tol/1000.0) rf2[j] = tol/1000.0; else if(rf2[j] > 0.5-tol/1000.0) rf2[j] = 0.5-tol/1000.0; } else rf2[j] = rf[j]; } if(verbose>1) { /* print estimates as we go along*/ Rprintf(" %4d ", it+1); maxdif=0.0; for(j=0; j<n_mar-1; j++) { temp = fabs(rf[j] - cur_rf[j])/(cur_rf[j]+tol*100.0); if(maxdif < temp) maxdif = temp; if(sexsp) { temp = fabs(rf2[j] - cur_rf2[j])/(cur_rf2[j]+tol*100.0); if(maxdif < temp) maxdif = temp; } } sprintf(text, "%s%s\n", " max rel've change = ", pattern); Rprintf(text, maxdif); } /* check convergence */ for(j=0, flag=0; j<n_mar-1; j++) { if(fabs(rf[j] - cur_rf[j]) > tol*(cur_rf[j]+tol*100.0) || (sexsp && fabs(rf2[j] - cur_rf2[j]) > tol*(cur_rf2[j]+tol*100.0))) { flag = 1; break; } } if(!flag) break; } /* end EM algorithm */ if(flag) warning("Didn't converge!\n"); /* calculate log likelihood */ *loglik = 0.0; for(i=0; i<n_ind; i++) { /* i = individual */ /* initialize alpha */ for(v=0; v<n_gen; v++) alpha[v][0] = initf(v+1) + emitf(Geno[0][i], v+1, error_prob); /* forward equations */ for(j=1; j<n_mar; j++) { for(v=0; v<n_gen; v++) { alpha[v][j] = alpha[0][j-1] + stepf(1, v+1, rf[j-1], rf2[j-1]); for(v2=1; v2<n_gen; v2++) alpha[v][j] = addlog(alpha[v][j], alpha[v2][j-1] + stepf(v2+1,v+1,rf[j-1],rf2[j-1])); alpha[v][j] += emitf(Geno[j][i],v+1,error_prob); } } curloglik = alpha[0][n_mar-1]; for(v=1; v<n_gen; v++) curloglik = addlog(curloglik, alpha[v][n_mar-1]); *loglik += curloglik; } if(verbose) { if(verbose < 2) { /* print final estimates */ Rprintf(" no. iterations = %d\n", it+1); maxdif=0.0; for(j=0; j<n_mar-1; j++) { temp = fabs(rf[j] - cur_rf[j])/(cur_rf[j]+tol*100.0); if(maxdif < temp) maxdif = temp; if(sexsp) { temp = fabs(rf2[j] - cur_rf2[j])/(cur_rf2[j]+tol*100.0); if(maxdif < temp) maxdif = temp; } } sprintf(text, "%s%s\n", " max rel've change at last step = ", pattern); Rprintf(text, maxdif); } Rprintf(" loglik: %10.4lf\n\n", *loglik); } }