/**
 * Print the interval-count based Fisher's exact test report to stdout.
 *
 * Builds a 2x2 contingency table of interval counts (overlapping, query
 * only, db only, neither), where the "neither" cell is estimated from the
 * genome size (minus the excluded length when an exclude file is in use)
 * divided by a heuristic mean interval size. The table, an equivalent R
 * phyper() call, the three p-values from kt_fisher_exact and the odds
 * ratio are then printed.
 *
 * @param outputMgr not used by this function.
 */
void Fisher::giveFinalReport(RecordOutputMgr *outputMgr)
{
    double left, right, two;

    long long genomeSize = _context->getGenomeFile()->getGenomeSize();
    if (_haveExclude) {
        genomeSize -= _excludeFile->getTotalFlattenedLength();
    }

    // bases covered by neither
    long long n22_full_bases = genomeSize;
    //long long n22_bases = genomeSize - _queryUnion - _dbUnion + _intersectionVal;

    // NOTE(review): a zero _dbCounts or _queryCounts makes these means
    // inf/NaN -- confirm that callers never reach here with empty inputs.
    long double dMean = 1.0 + _dbUnion / (long double)_dbCounts;
    long double qMean = 1.0 + _queryUnion / (long double)_queryCounts;

    // heuristic, but seems to work quite well -- better than doing the more
    // intuitive sum then divide.
    long double bMean = (qMean + dMean);
    //bMean = (_unionVal + 2.0 * _intersectionVal) / (long double)(_dbCounts + _queryCounts);

    // All table arithmetic is done in long long. Going through `long`
    // truncates on platforms where long is 32 bits.
    long long n11 = static_cast<long long>(_overlapCounts);
    // this could be < 0 because of multiple overlaps, hence the clamp
    long long n12 = max(0LL, static_cast<long long>(_queryCounts) - static_cast<long long>(_overlapCounts));
    long long n21 = max(0LL, static_cast<long long>(_dbCounts) - static_cast<long long>(_overlapCounts));
    long long n22_full = max(n21 + n12 + n11, (long long)(n22_full_bases / bMean));
    long long n22 = max(0LL, n22_full - n12 - n21 - n11);

    printf("# Number of query intervals: %lu\n", _queryCounts);
    printf("# Number of db intervals: %lu\n", _dbCounts);
    printf("# Number of overlaps: %lu\n", _overlapCounts);
    printf("# Number of possible intervals (estimated): %lld\n", n22_full);
    printf("# phyper(%lld - 1, %lu, %lld - %lu, %lu, lower.tail=F)\n", n11, _queryCounts, n22_full, _queryCounts, _dbCounts);

    cout << "# Contingency Table Of Counts" << endl;
    printf("#_________________________________________\n");
    printf("# | %-12s | %-12s |\n", " in -b", "not in -b");
    printf("# in -a | %-12lld | %-12lld |\n", n11, n12);
    printf("# not in -a | %-12lld | %-12lld |\n", n21, n22);
    printf("#_________________________________________\n");

    kt_fisher_exact(n11, n12, n21, n22, &left, &right, &two);
    double ratio = ((double)n11 / (double)n12) / ((double)n21 / (double)n22);

    printf("# p-values for fisher's exact test\n");
    printf("left\tright\ttwo-tail\tratio\n");

    /*
     * Some implementations report NAN as negative, some as positive. To
     * ensure we get consistent output from each compiler, we do our own
     * test. Since the test script assumes "-nan", let's settle on that.
     */
    if (std::isnan(ratio)) {
        printf("%.5g\t%.5g\t%.5g\t-nan\n", left, right, two);
    } else {
        printf("%.5g\t%.5g\t%.5g\t%.3f\n", left, right, two, ratio);
    }
}
/*
 * Test driver: reads whitespace-separated records "id n11 n12 n21 n22" from
 * stdin, runs kt_fisher_exact on each 2x2 table and prints the id, the four
 * counts, the returned probability and the left/right/two-tail p-values as a
 * tab-separated line. Reading stops at the first record that does not parse.
 */
int main(int argc, char *argv[])
{
	char id[1024];
	int n11, n12, n21, n22;
	double left, right, twotail, prob;

	/* The field width keeps an over-long id token from overflowing id[]. */
	while (scanf("%1023s%d%d%d%d", id, &n11, &n12, &n21, &n22) == 5) {
		prob = kt_fisher_exact(n11, n12, n21, n22, &left, &right, &twotail);
		printf("%s\t%d\t%d\t%d\t%d\t%.6g\t%.6g\t%.6g\t%.6g\n", id, n11, n12, n21, n22,
				prob, left, right, twotail);
	}
	return 0;
}
/*
 * Fill *a from the 16-value annotation vector.
 *
 * anno[0..3] are copied into a->d and summed into a->depth. a->is_tested is
 * set when both anno[0]+anno[1] and anno[2]+anno[3] are positive. When the
 * depth is zero the function stops there and returns -1, leaving all four
 * p-values at 1. Otherwise a->mq is derived from anno[9] and anno[11],
 * a->p[0] receives the two-sided Fisher p-value of the 2x2 table
 * anno[0..3], and a->p[1..3] receive ttest() results computed on anno[4..7],
 * anno[8..11] and anno[12..15]. Returns 0 in that case.
 */
static int test16_core(float anno[16], anno16_t *a)
{
	extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);
	double tail_lo, tail_hi;	/* one-sided p-values, not stored */
	int k;

	/* Neutral p-values and a copy of the four leading counts. */
	for (k = 0; k < 4; ++k) {
		a->p[k] = 1.;
		a->d[k] = anno[k];
	}

	a->depth = anno[0] + anno[1] + anno[2] + anno[3];
	a->is_tested = (anno[0] + anno[1] > 0 && anno[2] + anno[3] > 0);

	/* Nothing to test without any depth. */
	if (a->depth == 0)
		return -1;

	/* The +.499 rounds the square root to the nearest integer. */
	a->mq = (int)(sqrt((anno[9] + anno[11]) / a->depth) + .499);

	kt_fisher_exact(anno[0], anno[1], anno[2], anno[3], &tail_lo, &tail_hi, &a->p[0]);

	k = 1;
	while (k < 4) {
		a->p[k] = ttest(anno[0] + anno[1], anno[2] + anno[3], &anno[4 * k]);
		++k;
	}
	return 0;
}
bool Fisher::calculate() { if (!getFisher()) { return false; } // header cout << "# Contingency Table" << endl; // for fisher's exact test, we need the contingency table // XXXXXXXX | not in A | in A // not in B | n11: in neither | n12: only in A // in B | n21: only in B | n22: in A & B // double left, right, two; long long genomeSize = _context->getGenomeFile()->getGenomeSize(); // bases covered by neither a nor b long long n11 = genomeSize - _queryLen - _dbLen + _intersectionVal; // bases covered only by -a long long n12 = _queryLen - _intersectionVal; // bases covered only by -b long long n21 = _dbLen - _intersectionVal; // bases covered by both long long n22 = _intersectionVal; printf("#_________________________________________\n"); printf("# | %-12s | %-12s |\n", "not in -b", "in -b"); printf("# not in -a | %-12lld | %-12lld |\n", n11, n12); printf("# in -a | %-12lld | %-12lld |\n", n21, n22); printf("#_________________________________________\n"); kt_fisher_exact(n11, n12, n21, n22, &left, &right, &two); double ratio = ((double)n11 / (double)n12) / ((double)n21 / (double)n22); printf("# p-values for fisher's exact test\n"); printf("left\tright\ttwo-tail\tratio\n"); printf("%.5f\t%.5f\t%.5f\t%.3f\n", left, right, two, ratio); //kt_fisher_exact(50010000, 10000000, 15000000, 3000000, &left, &right, &two); return true; }
// Two-tailed Fisher's exact test p-value for the 2x2 table
// {cnts[0], cnts[1]; cnts[2], cnts[3]}. The one-sided p-values that
// kt_fisher_exact also produces are discarded.
double sb3(int cnts[4])
{
  double lowerTail = 0.0;
  double upperTail = 0.0;
  double bothTails = 0.0;

  kt_fisher_exact(cnts[0], cnts[1], cnts[2], cnts[3],
                  &lowerTail, &upperTail, &bothTails);

  return bothTails;
}
void analysis(dat &d,int nThreads,int nJack) { int *rowSum = new int[d.cn.size()]; int *rowMax = new int[d.cn.size()]; int *rowMaxW = new int[d.cn.size()]; int *error1 = new int[d.cn.size()];//number of non most frequent observed bases int *error2 = new int[d.cn.size()];//sampled size_t mat1[4]={0,0,0,0};//matrix used for fisher for method 1 size_t mat2[4]={0,0,0,0};//matrix used for fisher for method 2 size_t tab[2] = {0,0};//used for debug for(int i=0;i<d.cn.size();i++) { int s =d.cn[i][0]; int max=s; int which=0; for(int j=1;j<4;j++){ s += d.cn[i][j]; if(d.cn[i][j]>max){ max=d.cn[i][j]; which=j; } } rowSum[i] = s; rowMax[i]=max; rowMaxW[i]=which; aMap::iterator it= d.myMap.find(d.pos[i]); if(it!=d.myMap.end()){//if site is hapmap site // fprintf(stderr,"posi:%d wmax:%d all1:%d freq:%f\n",it->first,rowMaxW[i],it->second.allele1,it->second.freq); //if maximum occuring bases is the same as allele1 from hapmap, then set freq to 1-freq if(rowMaxW[i]==it->second.allele1) //it->first C++ syntax for getting key of iterator //it->second C++ syntax for getting value of key of iterator, key->value: key=pos,value=hapSite it->second.freq=1-it->second.freq; else it->second.freq=it->second.freq; // fprintf(stderr,"posi:%d wmax:%d all1:%d freq:%f\n",it->first,rowMaxW[i],it->second.allele1,it->second.freq); // exit(0); } error1[i] = rowSum[i]-rowMax[i]; error2[i] = simrbinom((1.0*error1[i])/(1.0*rowSum[i])); // fprintf(stdout,"simrbinom\t%d %d %d %d %d %d\n",rowSum[i],rowMax[i],rowMaxW[i],error1[i],error2[i],d.dist[i]); if(error1[i]>0) tab[1]++; else tab[0]++; if(d.dist[i]==0){//this is a snpsite mat1[0] +=error1[i]; mat1[1] +=rowSum[i]-error1[i]; mat2[0] +=error2[i]; mat2[1] +=1-error2[i]; // fprintf(stdout,"rs %d %d %d %d %d %d %d %f %d %d\n",d.pos[i],rowSum[i],rowMax[i],rowMaxW[i],error1[i],error2[i],d.dist[i],it->second.freq,it->second.allele1,it->second.allele2); }else{ mat1[2] +=error1[i]; mat1[3] +=rowSum[i]-error1[i]; mat2[2] += error2[i]; mat2[3] += 1-error2[i]; } } #if 
0 fprintf(stderr,"tab:%lu %lu\n",tab[0],tab[1]); fprintf(stderr,"mat: %lu %lu %lu %lu\n",mat1[0],mat1[1],mat1[2],mat1[3]); fprintf(stderr,"mat2: %lu %lu %lu %lu\n",mat2[0],mat2[1],mat2[2],mat2[3]); #endif int n11, n12, n21, n22; double left, right, twotail, prob; // fprintf(stderr,"--------\nMAIN RESULTS: Fisher exact test:\n"); n11=mat1[0];n12=mat1[2];n21=mat1[1];n22=mat1[3]; prob = kt_fisher_exact(n11, n12, n21, n22, &left, &right, &twotail); // fprintf(stdout,"Method\t n11 n12 n21 n22 prob left right twotail\n"); //fprintf(stdout,"%s\t%d\t%d\t%d\t%d\t%.6g\t%.6g\t%.6g\t%.6g\n", "method1", n11, n12, n21, n22, // prob, left, right, twotail); n11=mat2[0];n12=mat2[2];n21=mat2[1];n22=mat2[3]; prob = kt_fisher_exact(n11, n12, n21, n22, &left, &right, &twotail); //fprintf(stdout,"%s\t%d\t%d\t%d\t%d\t%.6g\t%.6g\t%.6g\t%.6g\n", "method2", n11, n12, n21, n22, // prob, left, right, twotail); //estimate how much contamination double c= mat1[2]/(1.0*(mat1[2]+mat1[3]));//this is error for flanking site double err= mat1[0]/(1.0*(mat1[0]+mat1[1]));//this is error for snpsite fprintf(stderr,"Mismatch_rate_for_flanking:%f MisMatch_rate_for_snpsite:%f \n",c,err); int *err0 =new int[d.cn.size()/9];//<-nbases of non frequent most occuring at snpsite int *err1 =new int[d.cn.size()/9];//<-nbases of non frequent most occuring at flanking int *d0 =new int[d.cn.size()/9];//<-seqdepth for snpsite int *d1 =new int[d.cn.size()/9];//<-seqdepth for flanking double *freq =new double[d.cn.size()/9];//<- freq for snpsite for(int i=0;i<d.cn.size()/9;i++){ int adj=0; int dep=0; for(int j=0;j<9;j++){ if(d.dist[i*9+j]!=0){//<- flanking adj += error1[i*9+j]; dep += rowSum[i*9+j]; }else{ //snpsite err0[i] = error1[i*9+j]; d0[i] = rowSum[i*9+j]; freq[i] =d.myMap.find(d.pos[i*9+j])->second.freq; } } err1[i] =adj; d1[i] = dep; #if 0 if(it==d.myMap.end()){ fprintf(stderr,"Problem finding:%d\n",d.pos[i]); exit(0); } #endif // fprintf(stdout,"cont\t%d\t%d\t%d\t%d\t%f\n",err0[i],err1[i],d0[i],d1[i],freq[i]); 
} allPars ap; ap.len=d.cn.size()/9; ap.seqDepth = d0; ap.nonMajor = err0; ap.freq = freq; ap.eps = c; ap.newllh =0; ap.e1 = err1; ap.d1=d1; double mom,momJack,ML,mlJack,val; ap.newllh =0; mom= likeOldMom(d.cn.size()/9,d0,err0,freq,c,-1); momJack = jackMom(&ap,nJack); tpars tp;tp.ap=≈tp.skip=-1; // print(tp.ap,"asdff1"); kmin_brent(myfun,1e-6,0.5-1e-6,&tp,0.0001,&ML); mlJack= jackML(&ap,nThreads,NULL,nJack); fprintf(stderr,"\nMethod1: old_llh Version: MoM:%f SE(MoM):%e ML:%f SE(ML):%e",mom,momJack,ML,mlJack); ap.newllh =1; mom=likeNewMom(d.cn.size()/9,d0,err0,freq,c,-1); momJack= jackMom(&ap,nJack); //marshall(&ap,"prem1"); val=kmin_brent(myfun,1e-6,0.5-1e-6,&tp,0.0001,&ML); // fprintf(stderr,"\nM1: ML:%f VAL:%f\n",ML,val); mlJack= jackML(&ap,nThreads,NULL,nJack); fprintf(stderr,"\nMethod1: new_llh Version: MoM:%f SE(MoM):%e ML:%f SE(ML):%e",mom,momJack,ML,mlJack); // fread(error2,sizeof(int),d.cn.size(),fopen("error2.bin","rb")); //for(int i=0;0&&i<d.cn.size();i++) // fprintf(stdout,"pik\t%d\n",error2[i]); //exit(0); for(int i=0;i<d.cn.size()/9;i++){ int adj=0; for(int j=0;j<9;j++){ if(d.dist[i*9+j]!=0){ adj += error2[i*9+j]; }else{ err0[i] = error2[i*9+j]; freq[i] =d.myMap.find(d.pos[i*9+j])->second.freq; // fprintf(stderr,"freq:%f\n",freq[i]); } } err1[i] =adj; d0[i] = 1; d1[i] = 8; #if 0 if(it==d.myMap.end()){ fprintf(stderr,"Problem finding:%d\n",d.pos[i]); exit(0); } #endif // fprintf(stdout,"cont\t%d\t%d\t%d\t%d\t%f\n",err0[i],err1[i],d0[i],d1[i],freq[i]); } ap.seqDepth = d0; ap.nonMajor = err0; ap.e1=err1; ap.d1=d1; ap.freq = freq; ap.newllh =0; mom= likeOldMom(d.cn.size()/9,d0,err0,freq,c,-1); momJack = jackMom(&ap,nJack); //print(tp.ap,"asdff2"); // exit(0); val = kmin_brent(myfun,1e-6,0.5-1e-6,&tp,0.0001,&ML); //fprintf(stderr,"\nML2:%f VAL:%f\n",ML,val); // exit(0); //FILE *fp = fopen("heyaa","w"); print(&ap,fp);fclose(fp); //return; mlJack= jackML(&ap,nThreads,NULL,nJack); fprintf(stderr,"\nMethod2: old_llh Version: MoM:%f SE(MoM):%e ML:%f 
SE(ML):%e",mom,momJack,ML,mlJack); ap.newllh =1; mom=likeNewMom(d.cn.size()/9,d0,err0,freq,c,-1); momJack= jackMom(&ap,nJack); kmin_brent(myfun,1e-6,0.5-1e-6,&tp,0.0001,&ML); mlJack= jackML(&ap,nThreads,NULL,nJack); fprintf(stderr,"\nMethod2: new_llh Version: MoM:%f SE(MoM):%e ML:%f SE(ML):%e\n",mom,momJack,ML,mlJack); delete [] rowSum; delete [] rowMax; delete [] rowMaxW; delete [] error1; delete [] error2; delete [] err0; delete [] err1; delete [] d0; delete [] d1; delete [] freq; }
int main(int argc, char **argv) { WAH_SIZE = 32; WAH_MAX_FILL_WORDS = (1<<(WAH_SIZE-1)) - 1; uint32_t num_chrms = 100; if ((argc != 4)) { errx(1, "usage:\t%s <input file> <index dir> <w|i>", argv[0]); } double genome_size = 3095677412.0; char *input_file = argv[1]; char *index_dir = argv[2]; char *i_type = argv[3]; struct input_file *in_f = input_file_init(input_file); int chrm_len = 50; char *chrm = (char *)malloc(chrm_len*sizeof(char)); uint32_t start, end; long offset; struct giggle_index *gi; gi = giggle_load(index_dir, uint32_t_ll_giggle_set_data_handler); uint32_t *file_counts = (uint32_t *) calloc(gi->file_index->num, sizeof(uint32_t)); uint32_t num_intervals = 0; double mean_interval_size = 0.0; while ( in_f->input_file_get_next_interval(in_f, &chrm, &chrm_len, &start, &end, &offset) >= 0 ) { num_intervals += 1; mean_interval_size += end - start; struct uint32_t_ll *R = (struct uint32_t_ll *)giggle_query_region(gi, chrm, start, end); if (R != NULL) { struct uint32_t_ll_node *curr = R->head; while (curr != NULL) { /* struct file_id_offset_pair *fid_off = (struct file_id_offset_pair *) unordered_list_get(gi->offset_index, curr->val); */ struct file_id_offset_pair fid_off = gi->offset_index->vals[curr->val]; struct file_data *fd = (struct file_data *) unordered_list_get(gi->file_index, fid_off.file_id); file_counts[fid_off.file_id] += 1; curr = curr->next; } uint32_t_ll_free((void **)&R); } } mean_interval_size = mean_interval_size/num_intervals; struct doubles_uint32_t_tuple *sig = (struct doubles_uint32_t_tuple *) calloc(gi->file_index->num, sizeof(struct doubles_uint32_t_tuple)); uint32_t i; for (i = 0; i < gi->file_index->num; ++i) { struct file_data *fd = (struct file_data *) unordered_list_get(gi->file_index, i); long long n11 = (long long)(file_counts[i]); long long n12 = (long long)(MAX(0,num_intervals - file_counts[i])); long long n21 = (long long)(MAX(0,fd->num_intervals - file_counts[i])); double comp_mean = 
((fd->mean_interval_size+mean_interval_size)); long long n22_full = (long long) MAX(n11 + n12 + n21, genome_size/comp_mean); long long n22 = MAX(0, n22_full - (n11 + n12 + n21)); double left, right, two; double r = kt_fisher_exact(n11, n12, n21, n22, &left, &right, &two); double ratio = (((double)n11/(double)n12) / ((double)n21/(double)n22)); //fprintf(stderr, "%s\t%f\n", fd->file_name, two); sig[i].d1 = right; sig[i].d2 = ratio; sig[i].u1 = i; sig[i].u2 = file_counts[i]; } qsort(sig, gi->file_index->num, sizeof(struct doubles_uint32_t_tuple), doubles_uint32_t_tuple_cmp); for (i = 0; i < gi->file_index->num; ++i) { struct file_data *fd = (struct file_data *) unordered_list_get(gi->file_index, sig[i].u1); /* printf("%s\t" "right:%f\t" "%f\n", fd->file_name, sig[i].d1, sig[i].d2); */ printf( "sig:%f\t" "size:%u\t" "overlap:%u\t" "ratio:%f\t" "%s\n", sig[i].d1, fd->num_intervals, sig[i].u2, sig[i].d2, fd->file_name); } giggle_index_destroy(&gi); cache.destroy(); }