예제 #1
0
int oneBin
(int *profile_chromStart, 
 int *profile_chromEnd, 
 int *profile_coverage, 
 int n_profiles,
 int *bin_total,
 int bin_chromStart,
 int bin_chromEnd){
  return binSum(
    profile_chromStart,
    profile_chromEnd,
    profile_coverage,
    n_profiles,
    bin_total,
    bin_chromEnd - bin_chromStart,
    1,
    bin_chromStart,
    EMPTY_AS_ZERO);
}
예제 #2
0
int
binSumLR
(int *data_start_end,
 int *chromStart, int *chromEnd,
 int *coverage, int n_entries,
 int *left_bin_vec, int *right_bin_vec,
 int left_chromStart, int right_chromStart,
 int bases_per_bin, int n_bins){
  int bin_chromEnd, bin_chromStart;
  int extra_chromStart, extra_chromEnd, extra_bases, extra_coverage;
  int status;
  /* printf("left bin_size=%d bins=%d start=%d\n",  */
  /* 	 bases_per_bin, n_bins, left_chromStart); */
  status = binSum(chromStart, chromEnd,
		  coverage, n_entries,
		  left_bin_vec,
		  bases_per_bin,
		  n_bins,
		  left_chromStart,
		  EMPTY_AS_ZERO);
  if(status != 0){
    return status;
  }
  /* printf("right bin_size=%d bins=%d start=%d\n",  */
  /* 	 bases_per_bin, n_bins, right_chromStart); */
  status = binSum(chromStart, chromEnd,
		  coverage, n_entries,
		  right_bin_vec,
		  bases_per_bin,
		  n_bins,
		  right_chromStart,
		  EMPTY_AS_ZERO);
  if(status != 0){
    return status;
  }
  for(int bin_i=0; bin_i < n_bins; bin_i++){
    //left bin.
    bin_chromStart = left_chromStart + bases_per_bin * bin_i;
    bin_chromEnd = bin_chromStart + bases_per_bin;
    if(data_start_end[0] < bin_chromEnd){
      if(data_start_end[0] <= bin_chromStart){
	//   (     data   ]
	//   (bin]
	//       (bin]
	// bin is completely data, leave it alone!
      }else{
	//    (  data  ]
	//   (bin]
	//    - extra
	// (bin]
	//  --- extra
	// bin has some data, so subtract the extra.
	extra_chromStart = bin_chromStart;
	extra_chromEnd = data_start_end[0];
	extra_bases = extra_chromEnd - extra_chromStart;
	//printf("left start=%d bases=%d\n", extra_chromStart, extra_bases);
	status = binSum(chromStart, chromEnd,
			coverage, n_entries,
			&extra_coverage,
			extra_bases,
			1,
			extra_chromStart,
			EMPTY_AS_ZERO);
	if(status != 0){
	  return status;
	}
	left_bin_vec[bin_i] -= extra_coverage;
      }
    }else{
      //      (  data  ]
      //  (bin]
      // bin does not overlap data, so set it to zero.
      left_bin_vec[bin_i] = 0;
    }
    //right bin.
    bin_chromStart = right_chromStart + bases_per_bin * bin_i;
    bin_chromEnd = bin_chromStart + bases_per_bin;
    if(bin_chromStart < data_start_end[1]){
      if(bin_chromEnd <= data_start_end[1]){
	// (  data  ]
	//      (bin]    
	//  (bin]
	// bin is completely data, leave it alone!
      }else{
	// (  data  ]
	//        (bin]
	//           -- extra
	extra_chromStart = data_start_end[1];
	extra_chromEnd = bin_chromEnd;
	extra_bases = extra_chromEnd - extra_chromStart;
	//printf("right start=%d bases=%d\n", extra_chromStart, extra_bases);
	status = binSum(chromStart, chromEnd,
			coverage, n_entries,
			&extra_coverage,
			extra_bases,
			1,
			extra_chromStart,
			EMPTY_AS_ZERO);
	if(status != 0){
	  return status;
	}
	right_bin_vec[bin_i] -= extra_coverage;
      }
    }else{
      // (  data  ]
      //          (bin]
      //              (bin]
      // bin does not overlap data, so set it to zero.
      right_bin_vec[bin_i] = 0;
    }
  }
  return 0;
}
예제 #3
0
int PeakSegJointFaster(
  struct ProfileList *profile_list,
  int bin_factor,
  double *mean_mat,
  double *flat_loss_vec,
  double *peak_loss_vec,
  int *peak_start_end,
  int *data_start_end
  ){
  int n_samples = profile_list->n_profiles;
  if(n_samples == 0){
    return ERROR_FASTER_NO_COVERAGE_DATA;
  }
  int chromStart, chromEnd, unfilled_chromStart, unfilled_chromEnd;
  double seg1_mean=-1, seg2_mean, seg3_mean;
  struct Profile *profile, *samples = profile_list->profile_vec;
  profile = samples;
  unfilled_chromEnd = get_max_chromEnd(profile);
  unfilled_chromStart = get_min_chromStart(profile);
  for(int sample_i=1; sample_i < n_samples; sample_i++){
    profile = samples + sample_i;
    chromStart = get_min_chromStart(profile);
    if(chromStart < unfilled_chromStart){
      unfilled_chromStart = chromStart;
    }
    chromEnd = get_max_chromEnd(profile);
    if(unfilled_chromEnd < chromEnd){
      unfilled_chromEnd = chromEnd;
    }
  }
  data_start_end[0] = unfilled_chromStart;
  data_start_end[1] = unfilled_chromEnd;
  int unfilled_bases = unfilled_chromEnd - unfilled_chromStart;
  double data_bases = (double) unfilled_bases;
  double bin_bases;
  if(unfilled_bases/bin_factor < 4){
    /*
      4 is smallest the number of data points for which the 3-segment
      optimization problem is not trivial.

      If we don't have at least this many data points for the first
      bin step, than we stop with an error.
    */
    return ERROR_FASTER_BIN_FACTOR_TOO_LARGE;
  }
  int bases_per_bin = 1;
  while(unfilled_bases/bases_per_bin/bin_factor >= 4){
    bases_per_bin *= bin_factor;
  }
  int n_bins = unfilled_bases / bases_per_bin;
  /*
    MaxBinSize() from line 1 of the JointZoom algorithm of the
    PeakSegJoint paper returns the value of the C variable
    bases_per_bin.

    Little b in the text of the section that describes the JointZoom
    algorithm is the C variable n_bins.
  */
  if(unfilled_bases % bases_per_bin != 0){
    n_bins ++ ;
  }
  if(n_bins < bin_factor*2){
    n_bins = bin_factor*2;
  }
  int extra_bases = n_bins  * bases_per_bin - unfilled_bases;
  int extra_before = extra_bases/2;
  int extra_after = extra_bases - extra_before;
  //Rprintf("bin_factor=%d n_bins=%d bases_per_bin=%d extra_before=%d extra_after=%d\n", bin_factor, n_bins, bases_per_bin, extra_before, extra_after);
  //int extra_count;
  int seg1_chromStart = unfilled_chromStart - extra_before;
  int seg3_chromEnd = unfilled_chromEnd + extra_after;

  int *sample_count_mat = (int*) malloc(n_bins * n_samples * sizeof(int));
  double *last_cumsum_vec = (double*) malloc(n_samples*sizeof(double));
  int *sample_cumsum_mat = (int*) malloc(n_bins * n_samples * sizeof(int));

  double *seg1_mean_vec = (double*)malloc(sizeof(double)*n_samples);
  double *seg2_mean_vec = (double*)malloc(sizeof(double)*n_samples);
  double *seg3_mean_vec = (double*)malloc(sizeof(double)*n_samples);
  
  double *seg1_loss_vec = (double*)malloc(sizeof(double)*n_samples);
  double *candidate_loss_vec = (double*)malloc(sizeof(double)*n_samples);

  int *left_bin_vec = (int*) malloc(n_bins * sizeof(int));
  int *right_bin_vec = (int*) malloc(n_bins * sizeof(int));
  int *left_cumsum_mat = (int*) malloc(n_bins * n_samples * sizeof(int));
  int *right_cumsum_mat = (int*) malloc(n_bins * n_samples * sizeof(int));
  int *left_initial_cumsum_vec = (int*) malloc(n_samples * sizeof(int));
  int *right_initial_cumsum_vec = (int*) malloc(n_samples * sizeof(int));
  
  int *left_cumsum_vec, *right_cumsum_vec;
  int left_cumsum_value, right_cumsum_value;
  int peakStart, peakEnd;
  int best_seg1, best_seg2;

  int *count_vec, *cumsum_vec, cumsum_value;
  int status;
  for(int sample_i=0; sample_i < n_samples; sample_i++){
    profile = samples + sample_i;
    count_vec = sample_count_mat + n_bins*sample_i;
    status = binSum(profile->chromStart, profile->chromEnd,
		    profile->coverage, profile->n_entries,
		    count_vec,
		    bases_per_bin, n_bins, seg1_chromStart, 
		    EMPTY_AS_ZERO);
    if(status != 0){
      free(sample_count_mat);
      free(last_cumsum_vec);
      free(sample_cumsum_mat);
  
      free(seg1_mean_vec);
      free(seg2_mean_vec);
      free(seg3_mean_vec);

      free(seg1_loss_vec);
      free(candidate_loss_vec);

      free(left_bin_vec);
      free(right_bin_vec);
      free(left_cumsum_mat);
      free(right_cumsum_mat);
      free(left_initial_cumsum_vec);
      free(right_initial_cumsum_vec);
      return status;
    }
  }//for sample_i
  int bin_i, offset;
  for(int sample_i=0; sample_i < n_samples; sample_i++){
    cumsum_value = 0;
    offset = n_bins * sample_i;
    count_vec = sample_count_mat + offset;
    cumsum_vec = sample_cumsum_mat + offset;
    //printf("[sample%02d] ", sample_i);
    for(bin_i=0; bin_i < n_bins; bin_i++){
      cumsum_value += count_vec[bin_i];
      //printf("%d ", cumsum_value);
      cumsum_vec[bin_i] = cumsum_value;
    }
    last_cumsum_vec[sample_i] = cumsum_value;
    seg1_mean = cumsum_value / data_bases;
    flat_loss_vec[sample_i] = OptimalPoissonLoss(cumsum_value, seg1_mean);
  }
  /*
    The for loops below implement the GridSearch() function mentioned
    on line 2 of the JointZoom algorithm in the PeakSegJoint paper.
  */
  double best_loss = INFINITY, feasible_loss;
  for(int seg1_LastIndex=0; seg1_LastIndex < n_bins-2; seg1_LastIndex++){
    peakStart = seg1_chromStart + (seg1_LastIndex+1)*bases_per_bin;
    if(unfilled_chromStart < peakStart){
      for(int sample_i=0; sample_i < n_samples; sample_i++){
	cumsum_vec = sample_cumsum_mat + n_bins*sample_i;
	cumsum_value = cumsum_vec[seg1_LastIndex];
	bin_bases = (seg1_LastIndex+1)*bases_per_bin;
	data_bases = bin_bases - (double)extra_before;
	/* printf("sample_i=%d extra_before=%d bin_bases=%f data_bases=%f\n", */
	/* 	     sample_i, extra_before, bin_bases, data_bases); */
	seg1_mean = cumsum_value/data_bases;
	seg1_mean_vec[sample_i] = seg1_mean;
	seg1_loss_vec[sample_i] = OptimalPoissonLoss(cumsum_value, seg1_mean);
      }
      for(int seg2_LastIndex=seg1_LastIndex+1; 
	  seg2_LastIndex < n_bins-1; 
	  seg2_LastIndex++){
	feasible_loss=0.0;
	peakEnd = seg1_chromStart + (seg2_LastIndex+1)*bases_per_bin;
	if(peakEnd < unfilled_chromEnd){
	  for(int sample_i=0; sample_i < n_samples; sample_i++){
	    candidate_loss_vec[sample_i] = seg1_loss_vec[sample_i];
	    cumsum_vec = sample_cumsum_mat + n_bins*sample_i;
	    //segment 2.
	    cumsum_value = cumsum_vec[seg2_LastIndex]-cumsum_vec[seg1_LastIndex];
	    data_bases = (seg2_LastIndex-seg1_LastIndex)*bases_per_bin;
	    seg2_mean = cumsum_value/data_bases;
	    seg2_mean_vec[sample_i] = seg2_mean;
	    candidate_loss_vec[sample_i] += OptimalPoissonLoss(cumsum_value, seg2_mean);
	    //segment 3.
	    cumsum_value = cumsum_vec[n_bins-1]-cumsum_vec[seg2_LastIndex];
	    bin_bases = (n_bins-1-seg2_LastIndex)*bases_per_bin;
	    data_bases = bin_bases - extra_after;
	    seg3_mean = cumsum_value/data_bases;
	    seg3_mean_vec[sample_i] = seg3_mean;
	    candidate_loss_vec[sample_i] += OptimalPoissonLoss(cumsum_value, seg3_mean);
	    if(seg1_mean < seg2_mean && seg2_mean > seg3_mean){
	      feasible_loss += candidate_loss_vec[sample_i];
	    }else{
	      feasible_loss += flat_loss_vec[sample_i];
	    }
	  }//sample_i
	  if(feasible_loss < best_loss){
	    best_loss = feasible_loss;
	    peak_start_end[0] = peakStart;
	    peak_start_end[1] = peakEnd;
	    for(int sample_i=0; sample_i < n_samples; sample_i++){
	      peak_loss_vec[sample_i] = candidate_loss_vec[sample_i];
	      mean_mat[sample_i] = seg1_mean_vec[sample_i];
	      mean_mat[sample_i+n_samples] = seg2_mean_vec[sample_i];
	      mean_mat[sample_i+n_samples*2] = seg3_mean_vec[sample_i];
	      // Also save the left/right cumsums, which are needed to
	      // perform the step2 optimization.
	      cumsum_vec = sample_cumsum_mat + n_bins*sample_i;
	      if(seg1_LastIndex == 0){
		left_initial_cumsum_vec[sample_i] = 0;
	      }else{
		left_initial_cumsum_vec[sample_i] = cumsum_vec[seg1_LastIndex-1];
	      }
	      right_initial_cumsum_vec[sample_i] = cumsum_vec[seg2_LastIndex-1];
	    }//for(sample_i
	  }//if(feasible_loss < best_loss
	}//if(peakEnd < unfilled_chromEnd
      }//for(seg2_LastIndex
    }//if(unfilled_chromStart < peakStart
  }//for(seg1_LastIndex
  //return status;//old end of Step1.
  /* When performing the minimization over peakStart/End locations, it
   * is possible that at any given bases_per_bin value, there is no
   * better solution than what we found for the previous bases_per_bin
   * value. In that case, we begin the search anew at a lower
   * resolution, but we need to copy the cumsums from the following
   * index of left_right_vec: */
  int left_chromStart, right_chromStart;
  /*
    The while loop below corresponds to line 3 of the JointZoom
    algorithm from the PeakSegJoint paper.
  */
  while(1 < bases_per_bin){
    best_seg1 = -1; // indicates no min found.
    left_chromStart = peak_start_end[0] - bases_per_bin;
    right_chromStart = peak_start_end[1] - bases_per_bin;
    /*
      Below in the C code we decrease the value of bases_per_bin,
      as in line 4 of the JointZoom algorithm in the PeakSegJoint
      paper.
    */
    bases_per_bin /= bin_factor;
    //printf("bases_per_bin=%d left cumsum before:\n", bases_per_bin);
    for(int sample_i=0; sample_i < n_samples; sample_i++){
      profile = profile_list->profile_vec + sample_i;
      //printf("binSumLR sample_i=%d\n", sample_i);
      status = binSumLR(data_start_end,
			profile->chromStart, profile->chromEnd,
			profile->coverage, profile->n_entries,
			left_bin_vec, right_bin_vec,
			left_chromStart, right_chromStart,
			bases_per_bin, n_bins);
      if(status != 0){
	//printf("binSumLR bad status\n");
	free(sample_count_mat);
	free(last_cumsum_vec);
	free(sample_cumsum_mat);
  
	free(seg1_mean_vec);
	free(seg2_mean_vec);
	free(seg3_mean_vec);

	free(seg1_loss_vec);
	free(candidate_loss_vec);

	free(left_bin_vec);
	free(right_bin_vec);
	free(left_cumsum_mat);
	free(right_cumsum_mat);
	free(left_initial_cumsum_vec);
	free(right_initial_cumsum_vec);
	return status;
      }
      left_cumsum_vec = left_cumsum_mat + n_bins*sample_i;
      left_cumsum_value = left_initial_cumsum_vec[sample_i];
      right_cumsum_vec = right_cumsum_mat + n_bins*sample_i;
      right_cumsum_value = right_initial_cumsum_vec[sample_i];
      //printf("%d ", left_cumsum_value);
      //printf("%d ", right_cumsum_value);
      for(int bin_i=0; bin_i < n_bins; bin_i++){
	left_cumsum_value += left_bin_vec[bin_i];
	left_cumsum_vec[bin_i] = left_cumsum_value;
	right_cumsum_value += right_bin_vec[bin_i];
	right_cumsum_vec[bin_i] = right_cumsum_value;
      }
    }//for(sample_i
    /* 
       cumsum matrices have been computed, so now use them to
       compute the loss and feasibility of all models.

       The for loops below correspond to SearchNearPeak() on line
       5 of the JointZoom algorithm in the PeakSegJoint paper.
    */
    for(int seg1_LastIndex=0; seg1_LastIndex < n_bins; seg1_LastIndex++){
      peakStart = left_chromStart + (seg1_LastIndex+1)*bases_per_bin;
      if(unfilled_chromStart < peakStart){
	//printf("[seg1last=%d] seg1 cumsum bases ", seg1_LastIndex);
	for(int sample_i=0; sample_i < n_samples; sample_i++){
	  left_cumsum_vec = left_cumsum_mat + n_bins*sample_i;
	  cumsum_value = left_cumsum_vec[seg1_LastIndex];
	  bin_bases = peakStart - seg1_chromStart;
	  data_bases = bin_bases - extra_before;
	  seg1_mean = cumsum_value/data_bases;
	  //printf("%d %f ", cumsum_value, bases_value);
	  seg1_mean_vec[sample_i] = seg1_mean;
	  seg1_loss_vec[sample_i] = OptimalPoissonLoss(cumsum_value, seg1_mean);
	}
	//printf("\n");
	for(int seg2_LastIndex=0; seg2_LastIndex < n_bins; seg2_LastIndex++){
	  peakEnd = right_chromStart + (seg2_LastIndex+1)*bases_per_bin;
	  if(peakStart < peakEnd && peakEnd < unfilled_chromEnd){
	    feasible_loss = 0.0;
	    //printf("[seg2last=%d]\n", seg2_LastIndex);
	    for(int sample_i=0; sample_i < n_samples; sample_i++){
	      left_cumsum_vec = left_cumsum_mat + n_bins*sample_i;
	      right_cumsum_vec = right_cumsum_mat + n_bins*sample_i;
	      //segment 1.
	      candidate_loss_vec[sample_i] = seg1_loss_vec[sample_i];
	      //segment 2.
	      cumsum_value = 
		right_cumsum_vec[seg2_LastIndex]-
		left_cumsum_vec[seg1_LastIndex];
	      data_bases = peakEnd-peakStart;
	      seg2_mean = cumsum_value/data_bases;
	      seg2_mean_vec[sample_i] = seg2_mean;
	      candidate_loss_vec[sample_i] += OptimalPoissonLoss(cumsum_value, seg2_mean);
	      //segment 3.
	      cumsum_value = 
		last_cumsum_vec[sample_i]-
		right_cumsum_vec[seg2_LastIndex];
	      bin_bases = seg3_chromEnd - peakEnd;
	      data_bases = bin_bases - extra_after;
	      seg3_mean = cumsum_value/data_bases;
	      seg3_mean_vec[sample_i] = seg3_mean;
	      candidate_loss_vec[sample_i] += OptimalPoissonLoss(cumsum_value, seg3_mean);
	      if(seg1_mean < seg2_mean && seg2_mean > seg3_mean){
		feasible_loss += candidate_loss_vec[sample_i];
	      }else{
		feasible_loss += flat_loss_vec[sample_i];
	      }
	    }//for(sample_i
	    if(feasible_loss < best_loss){
	      best_loss = feasible_loss;
	      peak_start_end[0] = peakStart;
	      peak_start_end[1] = peakEnd;
	      best_seg1 = seg1_LastIndex;
	      best_seg2 = seg2_LastIndex;
	      for(int sample_i=0; sample_i < n_samples; sample_i++){
		peak_loss_vec[sample_i] = candidate_loss_vec[sample_i];
		mean_mat[sample_i] = seg1_mean_vec[sample_i];
		mean_mat[sample_i+n_samples] = seg2_mean_vec[sample_i];
		mean_mat[sample_i+n_samples*2] = seg3_mean_vec[sample_i];
	      }
	    }//if(feasible_loss<best_loss
	  }//if(peakStart<peakEnd
	}//for(seg2_LastIndex
      }//if(unfilled_chromStart < peakStart
    }//for(seg1_LastIndex
    for(int sample_i=0; sample_i < n_samples; sample_i++){
      left_cumsum_vec = left_cumsum_mat + n_bins*sample_i;
      right_cumsum_vec = right_cumsum_mat + n_bins*sample_i;
      if(best_seg1 == -1){
	/* the bin_factor is the number of new bins that get put into old bins.
	   [ ] old bin   
	   _ _ new bins
	   bin_factor=2 [ _ _ ]   peakStart [ _ _ ]
	   bin_factor=3 [ _ _ _ ] peakStart [ _ _ _ ]
     
	   We always start the search on the new level from the old bin just
	   before the peakStart/End. Therefore if no new min is found, the
	   best choice is the old one, seg1_LastIndex=bin_factor-1 */
	best_seg1 = bin_factor-1;
	best_seg2 = bin_factor-1;
	//Rprintf("no new min found! best_seg1=%d best_seg2=%d\n", best_seg1, best_seg2);
      }
      if(best_seg1 != 0){
	left_initial_cumsum_vec[sample_i] = left_cumsum_vec[best_seg1-1];
      }
      if(best_seg2 != 0){
	right_initial_cumsum_vec[sample_i] = right_cumsum_vec[best_seg2-1];
      }
    }
  }//while(1 < bases_per_bin)
  //printf("free at end\n");
  free(sample_count_mat);
  free(last_cumsum_vec);
  free(sample_cumsum_mat);
  
  free(seg1_mean_vec);
  free(seg2_mean_vec);
  free(seg3_mean_vec);

  free(seg1_loss_vec);
  free(candidate_loss_vec);

  free(left_bin_vec);
  free(right_bin_vec);
  free(left_cumsum_mat);
  free(right_cumsum_mat);
  free(left_initial_cumsum_vec);
  free(right_initial_cumsum_vec);
  
  return 0;
}