Exemple #1
0
/*
 * Aggregate two per-position read-count tables into fixed-width bins.
 *
 * tumor_1bp_bin / normal_1bp_bin are flat arrays of (position, count) pairs:
 * element 2*i is the position and element 2*i+1 the read count of entry i.
 * The first entry is taken as the smallest position and the last entry as
 * the largest, so both tables are assumed to be sorted by position.
 *
 * n_tumor / n_normal	number of (position, count) pairs in each table
 * bin_size		width of a bin in positions; must be > 0
 *
 * Returns a new BIN_LIST holding one bin per window that contains at least
 * one read (empty windows are skipped).  The accumulated tumor and normal
 * totals are handed to set_totalreadcount() before returning.
 */
BIN_LIST binning(int *tumor_1bp_bin, int n_tumor, int *normal_1bp_bin, int n_normal,int bin_size)
{
	int i, nrow, max_pos, min_pos;
	int start, end;
	int i_tum = 0, i_norm = 0;
	double tumorcnt, normalcnt, freq, total;
	double N_tumor = 0.0, N_normal = 0.0;
	BIN_LIST bins = ll_new();

	/* Nothing to bin: avoid indexing element [-2] of an empty table. */
	if (n_tumor <= 0 && n_normal <= 0) {
		set_totalreadcount(N_tumor, N_normal);
		return bins;
	}

	/* Overall position range, taken only from the tables that have data. */
	if (n_tumor > 0 && n_normal > 0) {
		int t_last = tumor_1bp_bin[2*(n_tumor-1)];
		int n_last = normal_1bp_bin[2*(n_normal-1)];

		max_pos = (t_last > n_last) ? t_last : n_last;
		min_pos = (tumor_1bp_bin[0] < normal_1bp_bin[0]) ? tumor_1bp_bin[0] : normal_1bp_bin[0];
	} else if (n_tumor > 0) {
		max_pos = tumor_1bp_bin[2*(n_tumor-1)];
		min_pos = tumor_1bp_bin[0];
	} else {
		max_pos = normal_1bp_bin[2*(n_normal-1)];
		min_pos = normal_1bp_bin[0];
	}

	/* A few spare rows so the last partial window is always visited. */
	nrow = (max_pos-min_pos+1)/bin_size+10;

	start = min_pos - (min_pos-1)%bin_size; /* the start position of the left most bin */
	end = start + bin_size - 1;

	for (i = 0; i < nrow; i++) {
		tumorcnt = 0.0;
		normalcnt = 0.0;

		/*
		 * The index is tested BEFORE the array is read, so the loop never
		 * looks at the element one past the end of a table.
		 */
		while (i_tum < n_tumor && tumor_1bp_bin[2*i_tum] <= end) {
			tumorcnt += tumor_1bp_bin[2*i_tum+1];
			i_tum++;
		}
		while (i_norm < n_normal && normal_1bp_bin[2*i_norm] <= end) {
			normalcnt += normal_1bp_bin[2*i_norm+1];
			i_norm++;
		}

		total = tumorcnt + normalcnt;
		if (total > 0.0) {
			/* Only form the ratio when it is defined (no 0/0). */
			freq = tumorcnt/total;
			ll_append(bins, bin_new(tumorcnt, total, freq, start, end));
		}
		N_tumor += tumorcnt;
		N_normal += normalcnt;

		start += bin_size;
		end = start + bin_size - 1;
	}

	set_totalreadcount(N_tumor, N_normal);
	return bins;
}
Exemple #2
0
/*
 * A recursive break contig function.
 *
 * Visits bin 'bin_num' and decides, relative to the break point 'pos',
 * whether the bin goes wholesale to the right contig, stays wholesale in the
 * left contig, or overlaps the break.  In the overlapping case a duplicate
 * bin is set up for the right contig and the bin's range array is either
 * handed to the duplicate, left where it is, or split between the two.  The
 * function then recurses into both children.
 *
 * io		GapIO handle passed through to every cache/bin call.
 * h		Hash table keyed on range record number.  Non-annotation
 *		ranges are added with value 1 when they go to the right
 *		contig and 0 when they stay left; it is consulted when
 *		deciding where a sequence annotation belongs.
 * cl, cr	The left and right contigs.  cl->rec / cr->rec are compared
 *		against pleft / pright to detect "parent is the contig root".
 * bin_num	The current bin being moved or split.
 * pos		The contig break point.
 * offset	The absolute positional offset of this bin in original contig
 * level	Recursion depth; only used to indent the debug printf output.
 * pleft	The parent bin/contig record num in the left new contig
 * pright	The parent bin/contig record num in the right new contig
 * child_no     0 or 1 - whether this bin is the left/right child of its parent
 * complement	Complement state inherited from the caller; toggled here when
 *		this bin has BIN_COMPLEMENTED set, then passed on to children.
 *
 * Returns 0 on success, -1 if moving the bin to the right contig or
 * recursing into a child reports failure.
 */
static int break_contig_recurse(GapIO *io, HacheTable *h,
				contig_t *cl, contig_t *cr,
				tg_rec bin_num, int pos, int offset,
				int level, tg_rec pleft, tg_rec pright,
				int child_no, int complement) {
    int i, j, f_a, f_b;
    tg_rec rbin;
    bin_index_t *bin = get_bin(io, bin_num), *bin_dup ;
    //int bin_min, bin_max;
    int nseqs;
    tg_rec opright; /* old pright, needed if we revert back */

    /* Paired with the cache_decr() calls on the success return paths below */
    cache_incr(io, bin);

    if (bin->flags & BIN_COMPLEMENTED) {
	complement ^= 1;
    }

    /*
     * NOTE(review): f_a and f_b are assigned here but are not read anywhere
     * else in this function.
     */
    if (complement) {
	f_a = -1;
	f_b = offset + bin->size-1;
    } else {
	f_a = +1;
	f_b = offset;
    }

    printf("%*sBreak offset %d pos %d => test bin %"PRIrec": %d..%d\n",
	   level*4, "",
	   offset, pos, bin->rec,
	   NMIN(bin->start_used, bin->end_used),
	   NMAX(bin->start_used, bin->end_used));

    /*
     * Remember the old sequence count and zero it. The early-return paths
     * put it back via bin_incr_nseq(); the other paths recount from the
     * range array instead.
     */
    bin = cache_rw(io, bin);
    nseqs = bin->nseqs;
    bin->nseqs = 0;

    /* Invalidate any cached data */
    bin_invalidate_track(io, bin, TRACK_ALL);
    if (bin->flags & BIN_CONS_VALID) {
	bin->flags |= BIN_BIN_UPDATED;
	bin->flags &= ~BIN_CONS_VALID;
    }

    //bin_min = bin->rng ? NMIN(bin->start_used, bin->end_used) : offset;
    //bin_max = bin->rng ? NMAX(bin->start_used, bin->end_used) : offset;

    /*
     * Add to right parent if this bin is to the right of pos,
     * or if the used portion is to the right and we have no left child.
     *
     * FIXME: Not a valid assumption!
     * The used portion of a bin is not a placeholder for the used portion
     * of all the children beneath it. Therefore if the used portion of
     * this bin is > pos (and we have no left child) it still doesn't mean
     * that the absolute positions of the used portion of the right child
     * won't be < pos.
     */
    if (offset >= pos /*|| (bin_min >= pos && !bin->child[0])*/) {
	printf("%*sADD_TO_RIGHT pl=%"PRIrec" pr=%"PRIrec"\n",
	       level*4, "", pleft, pright);
	/*
	 * NOTE(review): this failure return leaves without cache_decr(io, bin)
	 * and without restoring nseqs - confirm whether that is intended.
	 */
	if (0 != break_contig_move_bin(io, bin,
				       cl, pleft, cr, pright, 
				       child_no))
	    return -1;

	bin_incr_nseq(io, bin, nseqs);
	cache_decr(io, bin);

	return 0;
    }

    /*
     * Add to left parent if this bin is entirely to the left of pos,
     * or if the used portion is to the left and we have no right child.
     */
    if (offset + bin->size < pos /*|| (bin_max < pos && !bin->child[1])*/) {
	printf("%*sADD_TO_LEFT\n", level*4, "");

	//if (0 != break_contig_move_bin(io, bin, cr, pright, cl, pleft, child_no))
	//return -1;

	/* Bin stays where it is; just restore the sequence count */
	bin_incr_nseq(io, bin, nseqs);
	cache_decr(io, bin);
	
	return 0;
    }

    /*
     * Nominally the bin overlaps both left and right and so needs duplicating.
     * There are cases though at the roots of our trees where duplicating is
     * unnecessary as it leads to empty bins at the root. In this case
     * we skip creating a duplicate for the right, or alternatively steal
     * the left root bin and use that instead.
     *
     * Similarly the range_t array will either be left where it is, moved to
     * the right contig, or split in half (creating a new one for the right).
     *
     * FIXED: always need this. Eg:
     *
     * |-------------empty--------------|
     * |----------------|---------------|
     * |--------|-------|--------|------|
     *             ^
     *             |
     *             break here
     *
     * In this case we need to duplicate the parent as it overlaps the left
     * bin, which may (or may not) have data that needs to end up in the right
     * hand contig. Just duplicate for now and free later on if needed.
     */
    /*
     * The leading "1 ||" makes this condition always true, so the else
     * branch (bin_dup = NULL) is currently unreachable and bin_dup is always
     * set when the code further down dereferences it.
     */
    if (1 /* always! */ || pright != cr->rec ||
	(bin->rng && NMAX(bin->start_used, bin->end_used) >= pos)) {
	//printf("NMAX=%d >= %d\n", NMAX(bin->start_used, bin->end_used), pos);

	rbin = 0;

	/* Possibly steal left contig's bin */
	/* With the #if 0 branch disabled, both arms just set pleft = bin->rec */
	if (pleft == cl->rec && NMIN(bin->start_used, bin->end_used) >= pos) {
#if 0
	    /* Currently this doesn't always work */
	    if (bin->child[1]) {
		bin_index_t *ch = get_bin(io, bin->child[1]);
		if (NMIN(ch->pos, ch->pos + ch->size-1) >= pos) {
		    rbin = cl->bin;
		    cl->bin = bin->child[0];
		}
	    }
#else
	    pleft = bin->rec;
#endif
	} else {
	    pleft = bin->rec;
	}

	/* Create new bin, or use root of contig if it's unused so far */
	if (!rbin && pright == cr->rec) {
	    rbin = cr->bin;
	}

	/* Otherwise we genuinely need a duplicate */
	if (!rbin)
	    rbin = bin_new(io, 0, 0, 0, GT_Bin);

	/* Initialise with duplicate values from left bin */
	bin_dup = get_bin(io, rbin);
	bin_dup = cache_rw(io, bin_dup);
	bin_dup->size = bin->size;
	bin_dup->pos = bin->pos;
	bin_dup->parent = pright;
	bin_dup->parent_type = (pright == cr->rec ? GT_Contig : GT_Bin);
	bin_dup->flags = bin->flags | BIN_BIN_UPDATED;
	bin_dup->start_used = bin->start_used;
	bin_dup->end_used = bin->end_used;

	/*
	 * Shift bin to offset if it's the contig root.
	 * It'll be shifted back by the correct amount later.
	 */
	if (pright == cr->rec) {
	    printf("moving root bin to offset=%d comp=%d\n", offset, complement);
	    bin_dup->pos = offset;
	}

	printf("%*sCreated dup for right, rec %"PRIrec"\n",
	       level*4,"", bin_dup->rec);
	/*
	 * NOTE(review): the return value is ignored here, whereas the
	 * ADD_TO_RIGHT path above treats non-zero as failure.
	 */
	break_contig_move_bin(io, bin_dup, cl, 0, cr, pright, child_no);
	/* opright is only consumed by the disabled (#if 0) purge code below */
	opright = pright;
	/* Children visited below are parented to the duplicate on the right */
	pright = bin_dup->rec;
    } else {
	bin_dup = NULL;
	pleft = bin->rec;
    }

    /*
     * Decide what happens to this bin's range array: nothing to do (empty),
     * hand it all to the right duplicate, keep it all on the left, or split.
     */
    if (!bin->rng) {
	/* Empty bin */
	printf("%*sEMPTY range\n", level*4, "");
	bin->start_used = bin->end_used = 0;
	bin->flags |= BIN_BIN_UPDATED;
	if (bin_dup) {
	    bin_dup->start_used = bin_dup->end_used = 0;
	    bin_dup->flags |= BIN_BIN_UPDATED;
	}
	    
    } else if (NMIN(bin->start_used, bin->end_used) >= pos) {
	/* Move range to right contig */
	printf("%*sDUP %"PRIrec", MOVE Array to right\n",
	       level*4, "", bin_dup->rec);

	/* Transfer the range array and its bookkeeping to the duplicate */
	bin_dup->rng = bin->rng;
	bin_dup->rng_rec = bin->rng_rec;
	bin_dup->rng_free = bin->rng_free;
	if (bin_dup->rng_rec)
	    bin_dup->flags |= BIN_RANGE_UPDATED;

	/* Only clear the source when the duplicate really is another bin */
	if (bin->rec != bin_dup->rec) {
	    bin->rng = NULL;
	    bin->rng_rec = 0;
	    bin->rng_free = -1;
	    bin->flags |= BIN_BIN_UPDATED;
	}

	bin->start_used = bin->end_used = 0;
	break_contig_reparent_seqs(io, bin_dup);

	/* Record every non-annotation range as "right" (1) and count them */
	if (bin_dup->rng) {
	    int n = ArrayMax(bin_dup->rng);
	    for (i = j = 0; i < n; i++) {
		/* r2 is declared but not used in this loop */
		range_t *r = arrp(range_t, bin_dup->rng, i), *r2;
		if (r->flags & GRANGE_FLAG_UNUSED)
		    continue;

		if ((r->flags & GRANGE_FLAG_ISMASK) != GRANGE_FLAG_ISANNO) {
		    HacheData hd; hd.i = 1;
		    HacheTableAdd(h, (char *)&r->rec, sizeof(r->rec), hd,NULL);
		    j++;
		}
	    }
	    bin_incr_nseq(io, bin_dup, j);
	}
    } else if (NMAX(bin->start_used, bin->end_used) < pos) {
	/* Range array already in left contig, so do nothing */
	printf("%*sMOVE Array to left\n", level*4, "");

	if (bin_dup)
	    bin_dup->start_used = bin_dup->end_used = 0;

	/* Record every non-annotation range as "left" (0) and count them */
	if (bin->rng) {
	    int n = ArrayMax(bin->rng);
	    for (i = j = 0; i < n; i++) {
		range_t *r = arrp(range_t, bin->rng, i);
		if (r->flags & GRANGE_FLAG_UNUSED)
		    continue;

		if ((r->flags & GRANGE_FLAG_ISMASK) != GRANGE_FLAG_ISANNO) {
		    HacheData hd; hd.i = 0;
		    HacheTableAdd(h, (char *)&r->rec, sizeof(r->rec), hd,NULL);
		    j++;
		}
	    }
	    bin_incr_nseq(io, bin, j);
	}
    } else {
	/* Range array covers pos, so split in two */
	/* nl/nr count sequences kept left / moved right */
	int n, nl = 0, nr = 0;
	/* Running extents of what stays left (l*) and what moves right (r*) */
	int lmin = bin->size, lmax = 0, rmin = bin->size, rmax = 0;

	printf("%*sDUP %"PRIrec", SPLIT array\n", level*4, "", bin_dup->rec);

	bin->flags |= BIN_RANGE_UPDATED;
	bin_dup->flags |= BIN_RANGE_UPDATED;

	bin_dup->rng = ArrayCreate(sizeof(range_t), 0);
	bin_dup->rng_free = -1;

	/* Pass 1 - hash sequences */
	/*
	 * Done first so that pass 2 can look up the side chosen for a
	 * sequence when it meets an annotation attached to that sequence.
	 */
	n = ArrayMax(bin->rng);
	for (i = 0; i < n; i++) {
	    range_t *r = arrp(range_t, bin->rng, i);
	    int cstart; /* clipped sequence positions */
	    seq_t *s;

	    if (r->flags & GRANGE_FLAG_UNUSED)
		continue;

	    if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISANNO)
		continue;

	    s = (seq_t *)cache_search(io, GT_Seq, r->rec);
	    /* Pick the clip formula from the seq's sign XOR the complement state */
	    if ((s->len < 0) ^ complement) {
		cstart = NMAX(r->start, r->end) - (s->right-1);
	    } else {
		cstart = NMIN(r->start, r->end) + s->left-1;
	    }
	    
	    /* 1 => goes to the right contig, 0 => stays in the left contig */
	    if (cstart >= pos)  {
		HacheData hd; hd.i = 1;
		HacheTableAdd(h, (char *)&r->rec, sizeof(r->rec), hd, NULL);
	    } else {
		HacheData hd; hd.i = 0;
		HacheTableAdd(h, (char *)&r->rec, sizeof(r->rec), hd, NULL);
	    }
	}
	
	/* Pass 2 - do the moving of anno/seqs */
	/* i scans every range; j is the write index compacting the left-hand array */
	n = ArrayMax(bin->rng);
	for (i = j = 0; i < n; i++) {
	    range_t *r = arrp(range_t, bin->rng, i), *r2;
	    int cstart; /* clipped sequence positions */

	    if (r->flags & GRANGE_FLAG_UNUSED)
		continue;

	    if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISANNO) {
		cstart = NMAX(r->start, r->end);
	    } else {
		seq_t *s = (seq_t *)cache_search(io, GT_Seq, r->rec);
		if ((s->len < 0) ^ complement) {
		    cstart = NMAX(r->start, r->end) - (s->right-1);
		} else {
		    cstart = NMIN(r->start, r->end) + s->left-1;
		}
	    }
	    
	    if (cstart >= pos &&
		((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISANNO)) {
		anno_ele_t *a = (anno_ele_t *)cache_search(io,
							   GT_AnnoEle,
							   r->rec);
		/* If it's an annotation on a sequence < pos then we
		 * still don't move.
		 *
		 * FIXME: we have no guarantee that the sequence being
		 * annotated is in the same bin as this annotation, as
		 * they may be different sizes and end up in different
		 * bins. (Should we enforce anno always in same bin as seq?
		 * If so, consensus annos fit anywhere?)
		 */
		if (a->obj_type == GT_Seq) {
		    HacheItem *hi = HacheTableSearch(h,
						     (char *)&r->pair_rec,
						     sizeof(r->pair_rec));

		    if (hi) {
			/* Owning seq stayed left: force this anno to stay left too */
			if (hi->data.i == 0)
			    cstart = pos-1;
		    } else {
			puts("FIXME: annotation for seq in unknown place - "
			     "work out correct location and move if needed.");
		    }
		}
	    }

	    if (cstart >= pos) {
		/* Append a copy to the right-hand array and widen its extents */
		r2 = (range_t *)ArrayRef(bin_dup->rng, ArrayMax(bin_dup->rng));
		*r2 = *r;
		if (rmin > r->start) rmin = r->start;
		if (rmin > r->end)   rmin = r->end;
		if (rmax < r->start) rmax = r->start;
		if (rmax < r->end)   rmax = r->end;
		if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISSEQ)
		    nr++;
	    } else {
		if (lmin > r->start) lmin = r->start;
		if (lmin > r->end)   lmin = r->end;
		if (lmax < r->start) lmax = r->start;
		if (lmax < r->end)   lmax = r->end;

		/* Slide the kept entry down over slots vacated by moved ones */
		if (j != i) {
		    r2 = arrp(range_t, bin->rng, j);
		    *r2 = *r;
		}
		j++;
		if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISSEQ)
		    nl++;
	    }
	}
	bin_incr_nseq(io, bin, nl);
	bin_incr_nseq(io, bin_dup, nr);


	/* Truncate the left-hand array to the entries that were kept */
	ArrayMax(bin->rng) = j;

#if 0
	/*
	 * Right now this causes problems, but I'm not sure why. Try again
	 * after we've fixed the bin->nseqs issues and other deallocation
	 * woes.
	 */

	if (ArrayMax(bin_dup->rng) == 0 && bin_dup->parent_type == GT_Bin) {
	    /* We didn't need it afterall! Odd. */
	    bin_index_t *pb;

	    printf("Purging bin %d that we didn't need afterall\n",
		   bin_dup->rec);
	    cache_rec_deallocate(io, GT_Bin, bin_dup->rec);
	    pb = cache_search(io, GT_Bin, bin_dup->parent);
	    if (pb->child[0] == bin_dup->rec)
		pb->child[0] = 0;
	    if (pb->child[1] == bin_dup->rec)
		pb->child[1] = 0;
	    bin_dup = NULL;
	    pright = opright;
	}
#endif

	if (bin_dup)
	    break_contig_reparent_seqs(io, bin_dup);

	/*
	 * NOTE(review): the strict '<' means a side whose remaining ranges
	 * all have start == end is treated as holding no data - confirm
	 * that this is intended.
	 */
	if (lmin < lmax) {
	    bin->start_used     = lmin;
	    bin->end_used       = lmax;
	} else {
	    /* No data left in bin */
	    bin->start_used = 0;
	    bin->end_used = 0;
	}

	printf("%*sLeft=>%d..%d right=>%d..%d\n", level*4, "",
	       lmin, lmax, rmin, rmax);

	if (bin_dup) {
	    if (rmin < rmax) {
		bin_dup->start_used = rmin;
		bin_dup->end_used   = rmax;
	    } else {
		/* No data moved in bin */
		bin_dup->start_used = 0;
		bin_dup->end_used   = 0;
	    }
	}
    }


    /* Recurse */
    /*
     * Each child's offset is the smaller of ch->pos and ch->pos+ch->size-1;
     * pleft/pright now name this bin and its right-hand duplicate.
     */
    for (i = 0; i < 2; i++) {
	bin_index_t *ch;
	if (!bin->child[i])
	    continue;

	ch = get_bin(io, bin->child[i]);
	/*
	 * NOTE(review): as with the ADD_TO_RIGHT failure path, this error
	 * return skips cache_decr(io, bin) - confirm whether intended.
	 */
	if (0 != break_contig_recurse(io, h, cl, cr, bin->child[i], pos,
				      NMIN(ch->pos, ch->pos + ch->size-1),
				      level+1, pleft, pright,
				      i, complement))
	    return -1;
    }

    cache_decr(io, bin);
    //    if (bin_dup)
    //	cache_decr(io, bin_dup);

    return 0;
}
Exemple #3
0
int main(int argc, char **argv)
{
	int n_tmor,n_nml,ncol,nbins;
	double  *tumor, *normal;
	SRM_binning args; 
	BIN_LIST bins = ll_new();
	
	args = option_assign(argc,argv);
	
	if(args.inbin_file==NULL){ /*input is not binned data*/
		tumor = read_table(args.in_tmor,&n_tmor,&ncol,-1,0);
		if(tumor==NULL) { fprintf(stderr,"Warning: the file %s is empty.\n",args.tumor_file);exit(1);}
		if(ncol!=1) {fprintf(stderr,"Error: tumor file has multiple columns.\n"); exit(1);}
		fprintf(stderr,"%d tumor reads loaded\n",n_tmor);

		normal = read_table(args.in_nml,&n_nml,&ncol,-1,0);
		if(normal==NULL)  { fprintf(stderr,"Warning: the file %s is empty.\n",args.normal_file);exit(1);}
		if(ncol!=1) { fprintf(stderr,"Error: normal file has multiple columns.\n"); exit(1);}
		fprintf(stderr,"%d normal reads loaded\n",n_nml);


		if(args.fdr==1&&args.resampling==1){
                        qsort(tumor,n_tmor,sizeof(double),cmpdouble);
                        fprintf(stderr,"sorted %d case reads\n",n_tmor);
                        qsort(normal,n_nml,sizeof(double),cmpdouble);
                        fprintf(stderr,"sorted %d control reads.\n",n_nml);
                        check_read(tumor,n_tmor);
                        check_read(normal,n_nml);

                        fprintf(stderr,"Start resampling\n");
                        bic_seq_resample(tumor,n_tmor,normal,n_nml,args);
                        free(tumor);tumor=NULL;
                        free(normal);normal = NULL;		
			}

		else{
			fprintf(stderr,"Binning\n");
			bins = sort_rms_binning(tumor,n_tmor,normal,n_nml,args.bin_size,&nbins,args.win_size,args.quantile,args.multple,args.outlier,args.tumor_file,args.normal_file);
			/*sort, remove singular positions and bin*/

			if(args.binout_file!=NULL){
				fprintf(stderr,"Reporting Binned data\n");
				BIN_LIST_print(bins, args.outbin);
				}

			if(args.fdr!=1){
				set_BinList(bins);
				fprintf(stderr,"Merging\n");
				if(args.autoselect_lambda!=1){
					bic_seq(args.paired);
					}else{
					bic_seq_auto(ll_length(bins),args.FP, args.paired);
					}
				bins = get_BinList();
				BIN_LIST_print(bins, args.output);
				ll_dealloc(bins);
				}else{
				SEG_PERMUTE segs = NULL;
				segs = bic_seq_perm(bins, args.tumor_freq, args.FP,args.B ,args.autoselect_lambda);
			        print_SEG_PERMUTE(segs,args.output);
			        SEG_PERMUTE_destroy(segs); segs = NULL;
				}
			}

		}else{
		double N_total=0.0, N_tumor=0.0,freq;
		int start, end, tumor, total;
	        while (fscanf(args.inbin, "%d %d %lf %d %d",
	                &tumor, &total, &freq, &start, &end) != EOF) {
	                ll_append(bins, bin_new(tumor, total, freq, start, end));
	                N_total += total; N_tumor += tumor;
                	}
		 set_totalreadcount(N_tumor,N_total-N_tumor);
		 
		if(args.fdr!=1){
			set_BinList(bins);
			fprintf(stderr,"Merging\n");
			if(args.autoselect_lambda!=1){
				bic_seq(args.paired);
				//bic_seq(0);
				}else{
				bic_seq_auto(ll_length(bins),args.FP, args.paired);
				//bic_seq_auto(ll_length(bins),args.FP, 0);
				}
			bins = get_BinList();
			BIN_LIST_print(bins, args.output);
			ll_dealloc(bins);
			}else{
			SEG_PERMUTE segs = NULL;
			segs = bic_seq_perm(bins, args.tumor_freq, args.FP,args.B ,args.autoselect_lambda);
			print_SEG_PERMUTE(segs,args.output);
			SEG_PERMUTE_destroy(segs); segs = NULL;
			}
		}

	return 0;	

}
Exemple #4
0
/*
 * Resampling-based segmentation.
 *
 * For each of the segs->size permutation slots created by
 * SEG_PERMUTE_create(args.B), two pseudo-samples are drawn with
 * random_sample() from the pooled tumor and normal reads (sizes drawn with
 * rbinom()), binned into windows of args.bin_size, and segmented with
 * bic_seq() or bic_seq_auto().  All resulting segmentations are printed to
 * args.output.
 *
 * tumor / normal	read positions; the first element is used as the
 *			minimum and the last as the maximum, so both are
 *			expected to be sorted ascending by the caller
 * n_tumor / n_nml	number of reads in each array; both must be > 0
 * args			run-time options (bin size, B, FP, paired, ...)
 *
 * Exits with status 1 on empty input or allocation failure.
 */
static void  bic_seq_resample(double *tumor, int n_tumor, double *normal, int n_nml, SRM_binning args)
{
	SEG_PERMUTE segs = NULL;
	int *tumor_bin, *normal_bin, nbins;
	int n_tumor_sample, n_normal_sample, i, k, total, start, end, kmin;
	double tmp, freq, N_tumor, N_normal;
	struct timeval tv;
	int seed;

	/* The range computation below reads element [0] and [n-1] of each array. */
	if (n_tumor <= 0 || n_nml <= 0) {
		fprintf(stderr,"Error in bic_seq_resample: empty read set\n");
		exit(1);
	}

	/* Seed both generators from the wall clock (microsecond resolution). */
	gettimeofday(&tv, NULL);
	seed = tv.tv_sec * 1000000 + tv.tv_usec;
	seed_set(seed);
	srand48(seed);

	segs = SEG_PERMUTE_create(args.B);

	/* Enough bins to cover the largest position, with some slack. */
	tmp = tumor[n_tumor-1] > normal[n_nml-1] ? tumor[n_tumor-1] : normal[n_nml-1];
	nbins = floor(tmp/args.bin_size)+10;
	nbins = nbins > 10 ? nbins : 10;
	tumor_bin = (int *) malloc(sizeof(int)*nbins);
	normal_bin = (int *) malloc(sizeof(int)*nbins);
	if (tumor_bin == NULL || normal_bin == NULL) {
		fprintf(stderr,"Error in bic_seq_resample: memory allocation failed\n");
		exit(1);
	}

	/* First bin index worth reporting: one below the smallest position's bin. */
	tmp = tumor[0] < normal[0] ? tumor[0] : normal[0];
	kmin = (int) floor(tmp/args.bin_size)-1;
	kmin = (kmin > 0 ? kmin : 0);

	for (i = 0; i < segs->size; i++) {
		n_tumor_sample = rbinom(args.tumor_freq, n_tumor+n_nml);
		n_normal_sample = rbinom(1-args.tumor_freq, n_tumor+n_nml);
		random_sample(tumor, n_tumor, normal, n_nml, n_tumor_sample,  args.bin_size, tumor_bin,  nbins, args.paired, args.insert, args.sd);
		random_sample(tumor, n_tumor, normal, n_nml, n_normal_sample, args.bin_size, normal_bin, nbins, args.paired, args.insert, args.sd);

		N_tumor = 0.0;
		N_normal = 0.0;
		for (k = kmin; k < nbins; k++) {
			start = k*args.bin_size+1;
			/*
			 * Inclusive end, as in binning(): with start+bin_size the end
			 * of bin k would coincide with the start of bin k+1.
			 */
			end = start+args.bin_size-1;
			total = tumor_bin[k] + normal_bin[k];
			if (total > 0) {
				/* Ratio formed only for non-empty bins (no 0/0). */
				freq = ((double) tumor_bin[k])/((double) total);
				ll_append(segs->bins_perm[i], bin_new(tumor_bin[k], total, freq, start, end));
			}
			N_tumor += tumor_bin[k];
			N_normal += normal_bin[k];
		}
		set_BinList(segs->bins_perm[i]);
		set_totalreadcount(N_tumor, N_normal);

		if (args.autoselect_lambda != 1) {
			bic_seq(args.paired);
		} else {
			bic_seq_auto(ll_length(segs->bins_perm[i]), args.FP, args.paired);
		}
		/* Store the merged list back into this permutation slot. */
		segs->bins_perm[i] = get_BinList();
	}

	print_SEG_PERMUTE(segs, args.output);
	SEG_PERMUTE_destroy(segs); segs = NULL;
	free(tumor_bin); tumor_bin = NULL;
	free(normal_bin); normal_bin = NULL;

	return;
}