Exemple #1
0
/**
 * Print the interval-count based Fisher's exact test report to stdout.
 *
 * A 2x2 contingency table is built from the number of query (-a) intervals,
 * database (-b) intervals and overlaps. The "in neither" cell is not observed;
 * it is estimated from the genome size (minus the flattened exclude regions
 * when an exclude file is in use) divided by a heuristic mean interval
 * footprint. The table, an equivalent R phyper() call, the three p-values
 * returned by kt_fisher_exact() and the odds ratio are then printed.
 *
 * @param outputMgr  Not used by this method.
 */
void Fisher::giveFinalReport(RecordOutputMgr *outputMgr)
{
    double left, right, two;

    long long genomeSize = _context->getGenomeFile()->getGenomeSize();
    if(_haveExclude){
        genomeSize -= _excludeFile->getTotalFlattenedLength();
    }
    // Number of bases available for placing intervals.
    const long long n22_full_bases = genomeSize;

    // NOTE(review): when _dbCounts or _queryCounts is 0 these divisions are
    // x/0 in long double arithmetic (inf or NaN) and the conversion of the
    // resulting estimate to long long further down is not well defined.
    // Confirm whether an empty input can reach this point.
    const long double dMean = 1.0 + _dbUnion / (long double)_dbCounts;
    const long double qMean = 1.0 + _queryUnion / (long double)_queryCounts;

    // Heuristic: the sum of the two per-file means worked better in practice
    // than the more intuitive "sum everything, then divide".
    const long double bMean = (qMean + dMean);

    // Widen every count to signed 64-bit *before* subtracting. The previous
    // code went through `long` (32-bit on some platforms) and, for n21,
    // subtracted in the members' own type first, which only produced a
    // negative value by way of wraparound when that type is unsigned.
    const long long queryCounts   = (long long)_queryCounts;
    const long long dbCounts      = (long long)_dbCounts;
    const long long overlapCounts = (long long)_overlapCounts;

    long long n11 = overlapCounts;
    // The differences can be negative because one interval may overlap
    // several intervals of the other file; clamp them at zero.
    long long n12 = max(0LL, queryCounts - overlapCounts);
    long long n21 = max(0LL, dbCounts - overlapCounts);
    // Estimated total number of "possible" intervals, never smaller than the
    // number actually observed.
    long long n22_full = max(n21 + n12 + n11, (long long)(n22_full_bases / bMean));
    long long n22 = max(0LL, n22_full - n12 - n21 - n11);

    printf("# Number of query intervals: %lu\n", _queryCounts);
    printf("# Number of db intervals: %lu\n", _dbCounts);
    printf("# Number of overlaps: %lu\n", _overlapCounts);
    printf("# Number of possible intervals (estimated): %lld\n", n22_full);

    printf("# phyper(%lld - 1, %lu, %lld - %lu, %lu, lower.tail=F)\n", n11, _queryCounts, n22_full, _queryCounts, _dbCounts);
    cout << "# Contingency Table Of Counts" << endl;
    printf("#_________________________________________\n");
    printf("#           | %-12s | %-12s |\n", " in -b", "not in -b");
    printf("#     in -a | %-12lld | %-12lld |\n", n11, n12);
    printf("# not in -a | %-12lld | %-12lld |\n", n21, n22);
    printf("#_________________________________________\n");

    kt_fisher_exact(n11, n12, n21, n22, &left, &right, &two);
    // Odds ratio; a zero cell yields inf or NaN in floating point.
    double ratio = ((double)n11 / (double)n12) / ((double)n21 / (double)n22);

    printf("# p-values for fisher's exact test\n");
    printf("left\tright\ttwo-tail\tratio\n");

    /* Some C libraries print NaN with a sign and some without. The test
     * script expects "-nan", so emit that literal ourselves instead of
     * relying on the platform's formatting.
     */
    if(std::isnan(ratio)) {
        printf("%.5g\t%.5g\t%.5g\t-nan\n", left, right, two);
    } else {
        printf("%.5g\t%.5g\t%.5g\t%.3f\n", left, right, two, ratio);
    }
}
Exemple #2
0
/*
 * Command-line driver for kt_fisher_exact().
 *
 * Reads whitespace-separated records "id n11 n12 n21 n22" from stdin until a
 * record fails to parse, and for each one prints the id, the four counts,
 * the value returned by kt_fisher_exact() and the left, right and two-tail
 * p-values as one tab-separated line on stdout.
 *
 * Always returns 0.
 */
int main(int argc, char *argv[])
{
	char id[1024];
	int n11, n12, n21, n22;
	double left, right, twotail, prob;

	(void)argc;
	(void)argv;

	/* The field width caps the id at sizeof(id) - 1 characters so that an
	 * over-long token cannot write past the end of the buffer. */
	while (scanf("%1023s%d%d%d%d", id, &n11, &n12, &n21, &n22) == 5) {
		prob = kt_fisher_exact(n11, n12, n21, n22, &left, &right, &twotail);
		printf("%s\t%d\t%d\t%d\t%d\t%.6g\t%.6g\t%.6g\t%.6g\n", id, n11, n12, n21, n22,
				prob, left, right, twotail);
	}
	return 0;
}
Exemple #3
0
/*
 * Fill the annotation record `a` from the 16 values in `anno`.
 *
 * anno[0..3] are copied to a->d and summed into a->depth. a->is_tested is set
 * when both anno[0]+anno[1] and anno[2]+anno[3] are positive. When the depth
 * is zero the function stops there and returns -1 (all four p-values stay at
 * 1). Otherwise a->mq is derived from anno[9] and anno[11], a->p[0] receives
 * the two-tail value from kt_fisher_exact() on anno[0..3], and a->p[1..3] are
 * filled by ttest() on anno[4..7], anno[8..11] and anno[12..15].
 *
 * Returns 0 on success, -1 when the depth is zero.
 */
static int test16_core(float anno[16], anno16_t *a)
{
    extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);
    double tail_lo, tail_hi;
    double root_mean;
    int k;

    /* Defaults for the p-values and a straight copy of the four counts. */
    for (k = 0; k < 4; ++k) {
        a->p[k] = 1.;
        a->d[k] = anno[k];
    }

    a->depth = anno[0] + anno[1] + anno[2] + anno[3];

    if (anno[0] + anno[1] > 0 && anno[2] + anno[3] > 0) {
        a->is_tested = 1;
    } else {
        a->is_tested = 0;
    }

    if (a->depth == 0) {
        return -1;
    }

    root_mean = sqrt((anno[9] + anno[11]) / a->depth);
    a->mq = (int)(root_mean + .499);

    /* Only the two-tail p-value is kept; the one-sided tails are discarded. */
    kt_fisher_exact(anno[0], anno[1], anno[2], anno[3], &tail_lo, &tail_hi, &a->p[0]);

    k = 1;
    while (k < 4) {
        a->p[k] = ttest(anno[0] + anno[1], anno[2] + anno[3], &anno[4 * k]);
        ++k;
    }
    return 0;
}
Exemple #4
0
/**
 * Run the base-pair level Fisher's exact test and print the report to stdout.
 *
 * The 2x2 table counts genome bases covered by neither file, only by -a
 * (the query), only by -b (the database), and by both. The table, the three
 * p-values from kt_fisher_exact() and the odds ratio are printed.
 *
 * @return false when getFisher() fails (nothing is printed), true otherwise.
 */
bool Fisher::calculate() {

    if (!getFisher()) {
        return false;
    }

    // header
    cout << "# Contingency Table" << endl;

    // Cells of the contingency table handed to the test:
    //   n11: bases in neither file     n12: bases only in -a
    //   n21: bases only in -b          n22: bases in both -a and -b
    double left, right, two;

    long long genomeSize = _context->getGenomeFile()->getGenomeSize();
    // bases covered by neither a nor b
    long long n11 = genomeSize - _queryLen - _dbLen + _intersectionVal;
    // bases covered only by -a
    long long n12 = _queryLen - _intersectionVal;
    // bases covered only by -b
    long long n21 = _dbLen - _intersectionVal;
    // bases covered by both
    long long n22 = _intersectionVal;

    // The printed table has -a on the rows and -b on the columns, so the
    // off-diagonal cells are:
    //   (not in -a, in -b)  = only in -b = n21
    //   (in -a, not in -b)  = only in -a = n12
    // They were previously printed the other way round, which mislabelled
    // the two "only in one file" counts.
    printf("#_________________________________________\n");
    printf("#           | %-12s | %-12s |\n", "not in -b", "in -b");
    printf("# not in -a | %-12lld | %-12lld |\n", n11, n21);
    printf("#     in -a | %-12lld | %-12lld |\n", n12, n22);
    printf("#_________________________________________\n");

    // NOTE(review): these are genome-scale 64-bit counts; confirm that the
    // kt_fisher_exact() declaration in this project takes 64-bit arguments.
    kt_fisher_exact(n11, n12, n21, n22, &left, &right, &two);
    // Odds ratio (n11*n22)/(n12*n21); it does not depend on which way round
    // the table is laid out. A zero cell yields inf or NaN.
    double ratio = ((double)n11 / (double)n12) / ((double)n21 / (double)n22);

    printf("# p-values for fisher's exact test\n");
    printf("left\tright\ttwo-tail\tratio\n");
    printf("%.5f\t%.5f\t%.5f\t%.3f\n", left, right, two, ratio);

    return true;
}
// Returns the two-tail p-value that kt_fisher_exact() reports for the 2x2
// table whose cells are cnts[0], cnts[1], cnts[2], cnts[3] (in that order).
// The one-sided tails and the function's own return value are discarded.
double sb3(int cnts[4]){
  const int cell11 = cnts[0];
  const int cell12 = cnts[1];
  const int cell21 = cnts[2];
  const int cell22 = cnts[3];

  double lowerTail, upperTail, bothTails;
  kt_fisher_exact(cell11, cell12, cell21, cell22, &lowerTail, &upperTail, &bothTails);

  return bothTails;
}
Exemple #6
0
// Contamination analysis over the per-site base counts held in `d`.
//
// Outline of what the code below does:
//   1. For every site i, sums the four counts d.cn[i][0..3], records the
//      largest count and its index, and derives two "error" measures:
//      error1 = depth minus the largest count ("method 1"), and error2 =
//      the value returned by simrbinom() for the error1/depth fraction
//      ("method 2").
//   2. Accumulates both measures into 2x2 tables split by d.dist[i]==0
//      (SNP site) versus d.dist[i]!=0 (flanking site), and runs
//      kt_fisher_exact() on each table (the result printing is commented out).
//   3. Prints the mismatch rate of flanking sites and SNP sites to stderr.
//   4. Treats the sites as consecutive groups of 9 and, for method 1 and then
//      method 2, builds per-group arrays, and prints a method-of-moments
//      estimate (likeOldMom / likeNewMom), its jackknife value (jackMom), a
//      kmin_brent() optimum of `myfun` on [1e-6, 0.5-1e-6] and its jackknife
//      value (jackML) to stderr, once with ap.newllh=0 and once with
//      ap.newllh=1.
//
// Parameters:
//   d        - data set; the fields read are cn, pos, dist and myMap.
//              The freq of entries in d.myMap is modified in place (step 1).
//   nThreads - forwarded to jackML().
//   nJack    - forwarded to jackMom() and jackML().
//
// NOTE(review): simrbinom() presumably draws a random sample, in which case
// the order of the calls in the first loop matters for reproducibility -
// confirm before reordering anything here.
void analysis(dat &d,int nThreads,int nJack) {
  int *rowSum = new int[d.cn.size()];//total count (depth) per site
  int *rowMax = new int[d.cn.size()];//largest of the four counts per site
  int *rowMaxW = new int[d.cn.size()];//index (0..3) of that largest count
  int *error1 = new int[d.cn.size()];//number of non most frequent observed bases 
  int *error2 = new int[d.cn.size()];//sampled 
  size_t mat1[4]={0,0,0,0};//matrix used for fisher for method 1
  size_t mat2[4]={0,0,0,0};//matrix used for fisher for method 2
  size_t tab[2] = {0,0};//used for debug

  // NOTE(review): `int i` is compared against the unsigned d.cn.size().
  for(int i=0;i<d.cn.size();i++) {
    // Sum the four counts and track the first index holding the maximum.
    int s =d.cn[i][0];
    int max=s;
    int which=0;
    for(int j=1;j<4;j++){
      s += d.cn[i][j];
      if(d.cn[i][j]>max){
	max=d.cn[i][j];
	which=j;
      }
    }
    rowSum[i] = s;
    rowMax[i]=max;
    rowMaxW[i]=which;
    aMap::iterator it= d.myMap.find(d.pos[i]);
    if(it!=d.myMap.end()){//if site is hapmap site
      // fprintf(stderr,"posi:%d wmax:%d all1:%d freq:%f\n",it->first,rowMaxW[i],it->second.allele1,it->second.freq);
      //if the most frequently occurring base is the same as allele1 from hapmap, then set freq to 1-freq
      if(rowMaxW[i]==it->second.allele1)
	//it->first C++ syntax for getting key of iterator
	//it->second C++ syntax for getting value of key of iterator, key->value: key=pos,value=hapSite
 	it->second.freq=1-it->second.freq;
      else
	//self-assignment: freq is left unchanged in this branch
	it->second.freq=it->second.freq;
      // fprintf(stderr,"posi:%d wmax:%d all1:%d freq:%f\n",it->first,rowMaxW[i],it->second.allele1,it->second.freq);
      // exit(0);
    }

    error1[i] = rowSum[i]-rowMax[i];
    // NOTE(review): when rowSum[i] is 0 the argument is 0.0/0.0 (NaN);
    // confirm that sites with zero depth cannot occur or that simrbinom copes.
    error2[i] = simrbinom((1.0*error1[i])/(1.0*rowSum[i]));
    //  fprintf(stdout,"simrbinom\t%d %d %d %d %d %d\n",rowSum[i],rowMax[i],rowMaxW[i],error1[i],error2[i],d.dist[i]);
    if(error1[i]>0)
      tab[1]++;
    else
      tab[0]++;
    // Table layout for both methods: [0]=errors at SNP sites,
    // [1]=non-errors at SNP sites, [2]=errors at flanking sites,
    // [3]=non-errors at flanking sites.
    if(d.dist[i]==0){//this is a snpsite
      mat1[0] +=error1[i];
      mat1[1] +=rowSum[i]-error1[i];
      mat2[0] +=error2[i];
      mat2[1] +=1-error2[i];
      // fprintf(stdout,"rs %d %d %d %d %d %d %d %f %d %d\n",d.pos[i],rowSum[i],rowMax[i],rowMaxW[i],error1[i],error2[i],d.dist[i],it->second.freq,it->second.allele1,it->second.allele2);
    }else{
      
      mat1[2] +=error1[i];
      mat1[3] +=rowSum[i]-error1[i];
      mat2[2] += error2[i];
      mat2[3] += 1-error2[i];
    }
  }
#if 0
  fprintf(stderr,"tab:%lu %lu\n",tab[0],tab[1]);
  fprintf(stderr,"mat: %lu %lu %lu %lu\n",mat1[0],mat1[1],mat1[2],mat1[3]);
  fprintf(stderr,"mat2: %lu %lu %lu %lu\n",mat2[0],mat2[1],mat2[2],mat2[3]);
#endif
  int n11, n12, n21, n22;
  // `prob`, `left`, `right` and `twotail` are written by the two tests below
  // but never read afterwards because the printing is commented out.
  double left, right, twotail, prob;
  //  fprintf(stderr,"--------\nMAIN RESULTS: Fisher exact test:\n");
  // The size_t accumulators are narrowed to int here.
  n11=mat1[0];n12=mat1[2];n21=mat1[1];n22=mat1[3];
  prob = kt_fisher_exact(n11, n12, n21, n22, &left, &right, &twotail);
  //  fprintf(stdout,"Method\t n11 n12 n21 n22 prob left right twotail\n");
  //fprintf(stdout,"%s\t%d\t%d\t%d\t%d\t%.6g\t%.6g\t%.6g\t%.6g\n", "method1", n11, n12, n21, n22,
  //		prob, left, right, twotail);

  n11=mat2[0];n12=mat2[2];n21=mat2[1];n22=mat2[3];
  prob = kt_fisher_exact(n11, n12, n21, n22, &left, &right, &twotail);
//fprintf(stdout,"%s\t%d\t%d\t%d\t%d\t%.6g\t%.6g\t%.6g\t%.6g\n", "method2", n11, n12, n21, n22,
  //			prob, left, right, twotail);

  //estimate how much contamination
  double c= mat1[2]/(1.0*(mat1[2]+mat1[3]));//this is error for flanking site
  double err= mat1[0]/(1.0*(mat1[0]+mat1[1]));//this is error for snpsite
  fprintf(stderr,"Mismatch_rate_for_flanking:%f MisMatch_rate_for_snpsite:%f \n",c,err);

  // Per-group arrays; the sites are processed as consecutive groups of 9
  // (any trailing sites beyond a multiple of 9 are ignored by the integer
  // division).
  int *err0 =new int[d.cn.size()/9];//<-number of bases that are not the most frequent one at snpsite
  int *err1 =new int[d.cn.size()/9];//<-number of bases that are not the most frequent one at flanking sites
  int *d0 =new int[d.cn.size()/9];//<-seqdepth for snpsite
  int *d1 =new int[d.cn.size()/9];//<-seqdepth for flanking
  double *freq =new double[d.cn.size()/9];//<- freq for snpsite

  // Method 1: fill the per-group arrays from error1 / rowSum.
  // NOTE(review): err0[i], d0[i] and freq[i] are only written when the group
  // contains an entry with d.dist==0; otherwise they stay uninitialised.
  for(int i=0;i<d.cn.size()/9;i++){
    int adj=0;
    int dep=0;
    for(int j=0;j<9;j++){
      
      if(d.dist[i*9+j]!=0){//<- flanking
	adj += error1[i*9+j];
	dep += rowSum[i*9+j];
      }else{ //snpsite
	err0[i] = error1[i*9+j];
	d0[i] = rowSum[i*9+j];
	
	// NOTE(review): the result of find() is dereferenced without being
	// compared to d.myMap.end(); this is undefined if the position is
	// missing from the map.
	freq[i] =d.myMap.find(d.pos[i*9+j])->second.freq;
	
      }
    }
    err1[i] =adj;
    d1[i] = dep;
#if 0
    if(it==d.myMap.end()){
      fprintf(stderr,"Problem finding:%d\n",d.pos[i]);
      exit(0);
    }
#endif
    //    fprintf(stdout,"cont\t%d\t%d\t%d\t%d\t%f\n",err0[i],err1[i],d0[i],d1[i],freq[i]);
    
  }


  // Bundle the per-group arrays for the estimators. `ap` only borrows the
  // arrays; they are released at the end of this function.
  allPars ap;
  ap.len=d.cn.size()/9;
  ap.seqDepth = d0;
  ap.nonMajor = err0;
  ap.freq = freq;
  ap.eps = c;
  ap.newllh =0;
  ap.e1 = err1;
  ap.d1=d1;

  // `val` receives kmin_brent()'s return value in places but is never read.
  double mom,momJack,ML,mlJack,val;

  // Method 1, ap.newllh=0.
  ap.newllh =0;
  mom= likeOldMom(d.cn.size()/9,d0,err0,freq,c,-1);
  momJack = jackMom(&ap,nJack);
  // `tp` points at `ap`, so later changes to `ap` are seen through it.
  tpars tp;tp.ap=&ap;tp.skip=-1;
  //  print(tp.ap,"asdff1");
  kmin_brent(myfun,1e-6,0.5-1e-6,&tp,0.0001,&ML);
  mlJack= jackML(&ap,nThreads,NULL,nJack);
  fprintf(stderr,"\nMethod1: old_llh Version: MoM:%f SE(MoM):%e ML:%f SE(ML):%e",mom,momJack,ML,mlJack);
  
  // Method 1, ap.newllh=1.
  ap.newllh =1;
  mom=likeNewMom(d.cn.size()/9,d0,err0,freq,c,-1);
  momJack= jackMom(&ap,nJack);
  //marshall(&ap,"prem1");
  val=kmin_brent(myfun,1e-6,0.5-1e-6,&tp,0.0001,&ML);
  //  fprintf(stderr,"\nM1: ML:%f VAL:%f\n",ML,val);
  mlJack= jackML(&ap,nThreads,NULL,nJack);
  fprintf(stderr,"\nMethod1: new_llh Version: MoM:%f SE(MoM):%e ML:%f SE(ML):%e",mom,momJack,ML,mlJack);
  // fread(error2,sizeof(int),d.cn.size(),fopen("error2.bin","rb"));
  //for(int i=0;0&&i<d.cn.size();i++)
  //  fprintf(stdout,"pik\t%d\n",error2[i]);
  //exit(0);
  // Method 2: refill the same arrays from error2, with the depth fixed at
  // 1 for the SNP site and 8 for the flanking sites of every group.
  for(int i=0;i<d.cn.size()/9;i++){
    int adj=0;
    for(int j=0;j<9;j++){
      if(d.dist[i*9+j]!=0){
	adj += error2[i*9+j];
      }else{
	err0[i] = error2[i*9+j];
	// NOTE(review): same unchecked find() as in the method 1 loop.
	freq[i] =d.myMap.find(d.pos[i*9+j])->second.freq;
	//	fprintf(stderr,"freq:%f\n",freq[i]);
      }
    }
    err1[i] =adj;
    d0[i] = 1; 
    d1[i] = 8;
#if 0
    if(it==d.myMap.end()){
      fprintf(stderr,"Problem finding:%d\n",d.pos[i]);
      exit(0);
    }
#endif
    //    fprintf(stdout,"cont\t%d\t%d\t%d\t%d\t%f\n",err0[i],err1[i],d0[i],d1[i],freq[i]);
    
  }

  // The pointers are unchanged (the arrays were refilled in place); they are
  // simply assigned again.
  ap.seqDepth = d0;
  ap.nonMajor = err0;
  ap.e1=err1;
  ap.d1=d1;
  ap.freq = freq;

  // Method 2, ap.newllh=0.
  ap.newllh =0;
  mom= likeOldMom(d.cn.size()/9,d0,err0,freq,c,-1);
  momJack = jackMom(&ap,nJack);
  //print(tp.ap,"asdff2");
  //  exit(0);
  val = kmin_brent(myfun,1e-6,0.5-1e-6,&tp,0.0001,&ML);
  //fprintf(stderr,"\nML2:%f VAL:%f\n",ML,val);
  // exit(0);
  //FILE *fp = fopen("heyaa","w");  print(&ap,fp);fclose(fp);
  //return;
  mlJack= jackML(&ap,nThreads,NULL,nJack);
  fprintf(stderr,"\nMethod2: old_llh Version: MoM:%f SE(MoM):%e ML:%f SE(ML):%e",mom,momJack,ML,mlJack);

  // Method 2, ap.newllh=1.
  ap.newllh =1;
  mom=likeNewMom(d.cn.size()/9,d0,err0,freq,c,-1);
  momJack= jackMom(&ap,nJack);
  kmin_brent(myfun,1e-6,0.5-1e-6,&tp,0.0001,&ML);
  mlJack= jackML(&ap,nThreads,NULL,nJack);
  fprintf(stderr,"\nMethod2: new_llh Version: MoM:%f SE(MoM):%e ML:%f SE(ML):%e\n",mom,momJack,ML,mlJack);
 

  // Release the per-site arrays ...
  delete [] rowSum;
  delete [] rowMax;
  delete [] rowMaxW;
  delete [] error1;
  delete [] error2;
  

  // ... and the per-group arrays that `ap` was pointing at.
  delete [] err0;
  delete [] err1;
  delete [] d0;
  delete [] d1;
  delete [] freq;



}
Exemple #7
0
/*
 * Enrichment scan of a query interval file against a GIGGLE index.
 *
 * Usage: <prog> <input file> <index dir> <w|i>
 * (the third argument is required by the argument check but not otherwise
 * used here).
 *
 * Every interval of the input file is looked up in the index and the hits
 * are tallied per indexed file. For each indexed file a 2x2 table is built
 * (hits, query intervals without a hit, indexed intervals without a hit, and
 * an estimate of the intervals in neither derived from a fixed genome size
 * and the two mean interval sizes) and passed to kt_fisher_exact(). The
 * files are then sorted with doubles_uint32_t_tuple_cmp and one line per
 * file is printed to stdout: right-tail p-value, number of intervals in the
 * file, number of hits, odds ratio and file name.
 *
 * Returns 0; argument and allocation errors terminate through errx().
 */
int main(int argc, char **argv)
{
    WAH_SIZE = 32;
    /* Shift an unsigned 1: with WAH_SIZE == 32, `1 << 31` overflows a 32-bit
     * signed int, which is undefined behaviour. The value stored is the
     * same 2^31 - 1. */
    WAH_MAX_FILL_WORDS = (1U<<(WAH_SIZE-1)) - 1;

    if (argc != 4) {
        errx(1,
             "usage:\t%s <input file> <index dir> <w|i>",
             argv[0]);
    }

    /* Genome length used for the "in neither" estimate. */
    double genome_size =  3095677412.0;

    char *input_file = argv[1];
    char *index_dir = argv[2];

    struct input_file *in_f = input_file_init(input_file);

    /* Chromosome name buffer; it is passed by address together with its
     * length to the interval reader. */
    int chrm_len = 50;
    char *chrm = (char *)malloc(chrm_len*sizeof(char));
    if (chrm == NULL)
        errx(1, "out of memory");
    uint32_t start, end;
    long offset;

    struct giggle_index *gi;

    gi = giggle_load(index_dir,
                     uint32_t_ll_giggle_set_data_handler);

    /* Number of hits per indexed file, indexed by file id. */
    uint32_t *file_counts = (uint32_t *)
            calloc(gi->file_index->num, sizeof(uint32_t));
    if (file_counts == NULL && gi->file_index->num > 0)
        errx(1, "out of memory");

    uint32_t num_intervals = 0;
    double mean_interval_size = 0.0;  /* running sum until divided below */
    while ( in_f->input_file_get_next_interval(in_f, 
                                               &chrm,
                                               &chrm_len,
                                               &start,
                                               &end,
                                               &offset) >= 0 ) {
        num_intervals += 1;
        mean_interval_size += end - start;

        struct uint32_t_ll *R =
                (struct uint32_t_ll *)giggle_query_region(gi,
                                                          chrm,
                                                          start,
                                                          end);
        if (R != NULL) {
            struct uint32_t_ll_node *curr = R->head;

            while (curr != NULL) {
                struct file_id_offset_pair fid_off = 
                    gi->offset_index->vals[curr->val];
                /* NOTE(review): the looked-up record is not used in this
                 * loop; the call is kept as it was. */
                struct file_data *fd = 
                    (struct file_data *)
                    unordered_list_get(gi->file_index, fid_off.file_id);
                (void)fd;

                file_counts[fid_off.file_id] += 1;

                curr = curr->next;
            }
            uint32_t_ll_free((void **)&R);
        }
    }

    /* An empty query file would otherwise give 0.0/0 (NaN) here. */
    if (num_intervals > 0)
        mean_interval_size = mean_interval_size/num_intervals;

    struct doubles_uint32_t_tuple *sig = (struct doubles_uint32_t_tuple *)
        calloc(gi->file_index->num, sizeof(struct doubles_uint32_t_tuple));
    if (sig == NULL && gi->file_index->num > 0)
        errx(1, "out of memory");

    uint32_t i;
    for (i = 0; i < gi->file_index->num; ++i) {
        struct file_data *fd = 
            (struct file_data *)
            unordered_list_get(gi->file_index, i);

        /* Widen to signed 64-bit before subtracting. The hit count can
         * exceed either interval count (one query interval may hit several
         * intervals of the same file); in unsigned arithmetic the difference
         * wrapped to a huge value instead of being clamped at zero. */
        long long n11 = (long long)file_counts[i];
        long long n12 = (long long)num_intervals - n11;
        if (n12 < 0)
            n12 = 0;
        long long n21 = (long long)fd->num_intervals - n11;
        if (n21 < 0)
            n21 = 0;

        /* Estimated number of possible intervals, never below the number
         * observed. Skipped when the combined mean is not positive so that
         * no inf/NaN is converted to an integer. */
        double comp_mean = fd->mean_interval_size + mean_interval_size;
        long long observed = n11 + n12 + n21;
        long long n22_full = observed;
        if (comp_mean > 0.0) {
            double estimated = genome_size/comp_mean;
            if (estimated > (double)observed)
                n22_full = (long long)estimated;
        }
        long long n22 = n22_full - observed;

        double left, right, two;
        kt_fisher_exact(n11, n12, n21, n22, &left, &right, &two);

        /* Odds ratio; a zero cell yields inf or NaN. */
        double ratio = (((double)n11/(double)n12) / ((double)n21/(double)n22));

        sig[i].d1 = right;
        sig[i].d2 = ratio;
        sig[i].u1 = i;
        sig[i].u2 = file_counts[i];
    }

    qsort(sig,
          gi->file_index->num,
          sizeof(struct doubles_uint32_t_tuple), 
          doubles_uint32_t_tuple_cmp);

    for (i = 0; i < gi->file_index->num; ++i) {
        /* u1 is the file id the tuple was built from. */
        struct file_data *fd = 
            (struct file_data *)
            unordered_list_get(gi->file_index, sig[i].u1);
        printf( "sig:%f\t"
                "size:%u\t"
                "overlap:%u\t"
                "ratio:%f\t"
                "%s\n",
                sig[i].d1,
                fd->num_intervals,
                sig[i].u2,
                sig[i].d2,
                fd->file_name);
    }

    /* NOTE(review): in_f is not released here because its destructor is not
     * visible from this code - confirm and add the matching call. */
    free(sig);
    free(file_counts);
    free(chrm);

    giggle_index_destroy(&gi);
    cache.destroy();
    return 0;
}