/*
 * binning - collapse per-position read counts into fixed-width bins.
 *
 * tumor_1bp_bin   flat array of (position, count) pairs for the tumor sample:
 *                 element 2*k is a position, element 2*k+1 the read count.
 * n_tumor         number of pairs in tumor_1bp_bin.
 * normal_1bp_bin  same layout for the normal sample.
 * n_normal        number of pairs in normal_1bp_bin.
 * bin_size        bin width; must be positive (it is used as a divisor).
 *
 * The first pair of each array is taken as its smallest position and the
 * last pair as its largest, so both arrays are expected to be sorted by
 * ascending position.
 *
 * Returns a newly created BIN_LIST containing one entry per bin that holds
 * at least one read (tumor count, total count, tumor fraction, start, end).
 * The overall tumor and normal read totals are reported through
 * set_totalreadcount() before returning.  If both inputs are empty an empty
 * list is returned and the totals are set to zero.
 */
BIN_LIST binning(int *tumor_1bp_bin, int n_tumor, int *normal_1bp_bin, int n_normal,int bin_size)
{
	int i, nrow, max_pos, min_pos;
	int start, end, i_tum, i_norm;
	double tumorcnt, normalcnt, freq, total, N_tumor, N_normal;
	BIN_LIST bins = ll_new();

	N_tumor = 0.0;
	N_normal = 0.0;

	/* Nothing to bin: avoid indexing element [2*(n-1)] of an empty array. */
	if (n_tumor <= 0 && n_normal <= 0) {
		set_totalreadcount(N_tumor, N_normal);
		return bins;
	}

	/* Position range covered by whichever lists actually contain data. */
	if (n_tumor <= 0) {
		min_pos = normal_1bp_bin[0];
		max_pos = normal_1bp_bin[2*(n_normal-1)];
	} else if (n_normal <= 0) {
		min_pos = tumor_1bp_bin[0];
		max_pos = tumor_1bp_bin[2*(n_tumor-1)];
	} else {
		max_pos = (tumor_1bp_bin[2*(n_tumor-1)] > normal_1bp_bin[2*(n_normal-1)])
			? tumor_1bp_bin[2*(n_tumor-1)]
			: normal_1bp_bin[2*(n_normal-1)];
		min_pos = (tumor_1bp_bin[0] < normal_1bp_bin[0])
			? tumor_1bp_bin[0]
			: normal_1bp_bin[0];
	}

	/* A few spare rows beyond the covered range; empty bins are not emitted. */
	nrow = (max_pos - min_pos + 1)/bin_size + 10;

	i_tum = 0;
	i_norm = 0;

	start = min_pos - (min_pos-1)%bin_size; /* the start position of the left most bin */
	end = start + bin_size - 1;

	for (i = 0; i < nrow; i++) {
		tumorcnt = 0.0;
		normalcnt = 0.0;

		/*
		 * The index must be range-checked BEFORE the array is read,
		 * otherwise the element one past the end is dereferenced once
		 * a list has been fully consumed.
		 */
		while (i_tum < n_tumor && tumor_1bp_bin[2*i_tum] <= end) {
			tumorcnt += tumor_1bp_bin[2*i_tum+1];
			i_tum++;
		}
		while (i_norm < n_normal && normal_1bp_bin[2*i_norm] <= end) {
			normalcnt += normal_1bp_bin[2*i_norm+1];
			i_norm++;
		}

		total = tumorcnt + normalcnt;
		if (total > 0.0) {
			/* Only non-empty bins are kept, so total is never zero here. */
			freq = tumorcnt/total;
			ll_append(bins, bin_new(tumorcnt, total, freq, start, end));
		}

		N_tumor += tumorcnt;
		N_normal += normalcnt;

		start += bin_size;
		end = start + bin_size - 1;
	}

	set_totalreadcount(N_tumor, N_normal);

	return bins;
}
/*
 * A recursive break contig function.
 *
 * Visits bin 'bin_num' and then its children, deciding for each bin whether
 * it is attached to the right-hand tree, left alone, or paired with a bin on
 * the right-hand side ("bin_dup") with its range array moved, kept or split
 * between the two.
 *
 * io         Database handle, passed through to the cache/bin helpers.
 * h          Hash table filled in here: keyed on the record number of each
 *            non-annotation range item, data.i == 0 when the item stayed in
 *            the left bin and 1 when it went to the right bin.  It is also
 *            consulted to decide where sequence annotations go.
 * cl, cr     The left and right contigs.
 * bin_num    The current bin being moved or split.
 * pos        The contig break point.
 * offset     The absolute positional offset of this bin in original contig
 * level      Recursion depth; only used to indent the debugging output.
 * pleft      The parent bin/contig record num in the left new contig
 * pright     The parent bin/contig record num in the right new contig
 * child_no   0 or 1 - whether this bin is the left/right child of its parent
 * complement Orientation flag inherited from the caller; it is toggled when
 *            this bin has BIN_COMPLEMENTED set and the result is handed on
 *            to the children.
 *
 * Returns 0 on success, -1 if break_contig_move_bin() fails for a bin lying
 * wholly to the right of pos or if a recursive call fails.
 *
 * NOTE(review): the "return -1" paths leave without the cache_decr() that
 * balances the cache_incr() at the top of the function - confirm whether the
 * callers rely on this or whether it is a reference leak.
 */
static int break_contig_recurse(GapIO *io, HacheTable *h,
				contig_t *cl, contig_t *cr,
				tg_rec bin_num, int pos, int offset,
				int level, tg_rec pleft, tg_rec pright,
				int child_no, int complement) {
    /*
     * f_a and f_b are assigned below but never referenced by name in this
     * function; presumably they are picked up by the NMIN/NMAX macros -
     * TODO confirm against the macro definitions.
     */
    int i, j, f_a, f_b;
    tg_rec rbin;
    bin_index_t *bin = get_bin(io, bin_num), *bin_dup ;
    //int bin_min, bin_max;
    int nseqs;
    tg_rec opright; /* old pright, needed if we revert back */

    /* Take a reference on the bin for the duration of this call. */
    cache_incr(io, bin);

    if (bin->flags & BIN_COMPLEMENTED) {
	complement ^= 1;
    }

    if (complement) {
	f_a = -1;
	f_b = offset + bin->size-1;
    } else {
	f_a = +1;
	f_b = offset;
    }

    printf("%*sBreak offset %d pos %d => test bin %"PRIrec": %d..%d\n",
	   level*4, "",
	   offset, pos, bin->rec,
	   NMIN(bin->start_used, bin->end_used),
	   NMAX(bin->start_used, bin->end_used));

    /*
     * Switch to a writable copy, remember the old sequence count and zero
     * it; every branch below re-adds a count through bin_incr_nseq().
     */
    bin = cache_rw(io, bin);
    nseqs = bin->nseqs;
    bin->nseqs = 0;

    /* Invalidate any cached data */
    bin_invalidate_track(io, bin, TRACK_ALL);
    if (bin->flags & BIN_CONS_VALID) {
	bin->flags |= BIN_BIN_UPDATED;
	bin->flags &= ~BIN_CONS_VALID;
    }

    //bin_min = bin->rng ? NMIN(bin->start_used, bin->end_used) : offset;
    //bin_max = bin->rng ? NMAX(bin->start_used, bin->end_used) : offset;

    /*
     * Add to right parent if this bin is to the right of pos,
     * or if the used portion is to the right and we have no left child.
     *
     * FIXME: Not a valid assumption!
     * The used portion of a bin is not a placeholder for the used portion
     * of all the children beneath it. Therefore if the used portion of
     * this bin is > pos (and we have no left child) it still doesn't mean
     * that the absolute positions of the used portion of the right child
     * won't be < pos.
     *
     * (Only the "offset >= pos" half of that test is currently enabled.)
     */
    if (offset >= pos /*|| (bin_min >= pos && !bin->child[0])*/) {
	printf("%*sADD_TO_RIGHT pl=%"PRIrec" pr=%"PRIrec"\n",
	       level*4, "", pleft, pright);
	if (0 != break_contig_move_bin(io, bin,
				       cl, pleft, cr, pright,
				       child_no))
	    return -1;

	/* Restore the sequence count that was zeroed above. */
	bin_incr_nseq(io, bin, nseqs);
	cache_decr(io, bin);

	return 0;
    }

    /*
     * Add to left parent if this bin is entirely to the left of pos,
     * or if the used portion is to the left and we have no right child.
     *
     * (Only the "offset + size < pos" half is enabled, and the move itself
     * is commented out, so the bin is simply left where it is.)
     */
    if (offset + bin->size < pos /*|| (bin_max < pos && !bin->child[1])*/) {
	printf("%*sADD_TO_LEFT\n", level*4, "");

	//if (0 != break_contig_move_bin(io, bin, cr, pright, cl, pleft, child_no))
	//return -1;

	bin_incr_nseq(io, bin, nseqs);
	cache_decr(io, bin);

	return 0;
    }

    /*
     * Nominally the bin overlaps both left and right and so needs duplicating.
     * There are cases though at the roots of our trees where duplicating is
     * unnecessary as it leads to empty bins at the root. In this case
     * we skip creating a duplicate for the right, or alternatively steal
     * the left root bin and use that instead.
     *
     * Similarly the range_t array will either be left where it is, moved to
     * the right contig, or split in half (creating a new one for the right).
     *
     * FIXED: always need this. Eg:
     *
     * |-------------empty--------------|
     * |----------------|---------------|
     * |--------|-------|--------|------|
     *             ^
     *             |
     *             break here
     *
     * In this case we need to duplicate the parent as it overlaps the left
     * bin, which may (or may not) have data that needs to end up in the right
     * hand contig. Just duplicate for now and free later on if needed.
     *
     * The leading "1 ||" makes this condition constant-true, so the else
     * branch further down is unreachable and bin_dup is always set once
     * this block has run.
     */
    if (1 /* always! */ || pright != cr->rec ||
	(bin->rng && NMAX(bin->start_used, bin->end_used) >= pos)) {
	//printf("NMAX=%d >= %d\n", NMAX(bin->start_used, bin->end_used), pos);

	rbin = 0;

	/* Possibly steal left contig's bin */
	if (pleft == cl->rec &&
	    NMIN(bin->start_used, bin->end_used) >= pos) {
#if 0
	    /* Currently this doesn't always work */
	    if (bin->child[1]) {
		bin_index_t *ch = get_bin(io, bin->child[1]);
		if (NMIN(ch->pos, ch->pos + ch->size-1) >= pos) {
		    rbin = cl->bin;
		    cl->bin = bin->child[0];
		}
	    }
#else
	    pleft = bin->rec;
#endif
	} else {
	    pleft = bin->rec;
	}

	/* Create new bin, or use root of contig if it's unused so far */
	if (!rbin && pright == cr->rec) {
	    rbin = cr->bin;
	}

	/* Otherwise we genuinely need a duplicate */
	if (!rbin)
	    rbin = bin_new(io, 0, 0, 0, GT_Bin);

	/* Initialise with duplicate values from left bin */
	bin_dup = get_bin(io, rbin);
	bin_dup = cache_rw(io, bin_dup);
	bin_dup->size = bin->size;
	bin_dup->pos = bin->pos;
	bin_dup->parent = pright;
	bin_dup->parent_type = (pright == cr->rec ? GT_Contig : GT_Bin);
	bin_dup->flags = bin->flags | BIN_BIN_UPDATED;
	bin_dup->start_used = bin->start_used;
	bin_dup->end_used = bin->end_used;

	/*
	 * Shift bin to offset if it's the contig root.
	 * It'll be shifted back by the correct amount later.
	 */
	if (pright == cr->rec) {
	    printf("moving root bin to offset=%d comp=%d\n", offset, complement);
	    bin_dup->pos = offset;
	}

	printf("%*sCreated dup for right, rec %"PRIrec"\n",
	       level*4,"", bin_dup->rec);
	/*
	 * NOTE(review): unlike the ADD_TO_RIGHT case above, the return value
	 * of break_contig_move_bin() is ignored here - confirm intentional.
	 */
	break_contig_move_bin(io, bin_dup, cl, 0, cr, pright, child_no);

	/* opright is only read inside the "#if 0" purge code further down. */
	opright = pright;
	pright = bin_dup->rec;
    } else {
	bin_dup = NULL;
	pleft = bin->rec;
    }

    if (!bin->rng) {
	/* Empty bin */
	printf("%*sEMPTY range\n", level*4, "");
	bin->start_used = bin->end_used = 0;
	bin->flags |= BIN_BIN_UPDATED;
	if (bin_dup) {
	    bin_dup->start_used = bin_dup->end_used = 0;
	    bin_dup->flags |= BIN_BIN_UPDATED;
	}

    } else if (NMIN(bin->start_used, bin->end_used) >= pos) {
	/* Move range to right contig */
	printf("%*sDUP %"PRIrec", MOVE Array to right\n",
	       level*4, "", bin_dup->rec);

	/* Hand the whole range array over to the right-hand bin. */
	bin_dup->rng = bin->rng;
	bin_dup->rng_rec = bin->rng_rec;
	bin_dup->rng_free = bin->rng_free;
	if (bin_dup->rng_rec)
	    bin_dup->flags |= BIN_RANGE_UPDATED;

	/*
	 * Detach the array from the left bin, unless both names refer to
	 * the same record, in which case it must keep it.
	 */
	if (bin->rec != bin_dup->rec) {
	    bin->rng = NULL;
	    bin->rng_rec = 0;
	    bin->rng_free = -1;
	    bin->flags |= BIN_BIN_UPDATED;
	}

	bin->start_used = bin->end_used = 0;
	break_contig_reparent_seqs(io, bin_dup);

	if (bin_dup->rng) {
	    int n = ArrayMax(bin_dup->rng);
	    /* Record every non-annotation item as "right" (1); j counts them. */
	    for (i = j = 0; i < n; i++) {
		range_t *r = arrp(range_t, bin_dup->rng, i), *r2;
		if (r->flags & GRANGE_FLAG_UNUSED)
		    continue;

		if ((r->flags & GRANGE_FLAG_ISMASK) != GRANGE_FLAG_ISANNO) {
		    HacheData hd;
		    hd.i = 1;
		    HacheTableAdd(h, (char *)&r->rec, sizeof(r->rec), hd,NULL);
		    j++;
		}
	    }
	    bin_incr_nseq(io, bin_dup, j);
	}
    } else if (NMAX(bin->start_used, bin->end_used) < pos) {
	/* Range array already in left contig, so do nothing */
	printf("%*sMOVE Array to left\n", level*4, "");

	if (bin_dup)
	    bin_dup->start_used = bin_dup->end_used = 0;

	if (bin->rng) {
	    int n = ArrayMax(bin->rng);
	    /* Record every non-annotation item as "left" (0); j counts them. */
	    for (i = j = 0; i < n; i++) {
		range_t *r = arrp(range_t, bin->rng, i);
		if (r->flags & GRANGE_FLAG_UNUSED)
		    continue;

		if ((r->flags & GRANGE_FLAG_ISMASK) != GRANGE_FLAG_ISANNO) {
		    HacheData hd;
		    hd.i = 0;
		    HacheTableAdd(h, (char *)&r->rec, sizeof(r->rec), hd,NULL);
		    j++;
		}
	    }
	    bin_incr_nseq(io, bin, j);
	}
    } else {
	/* Range array covers pos, so split in two */
	int n, nl = 0, nr = 0;
	/*
	 * Running min/max of the start/end values kept on the left and moved
	 * to the right.  They start "inverted" (min = bin size, max = 0) so
	 * that min >= max afterwards is treated as "no data" below.
	 */
	int lmin = bin->size, lmax = 0, rmin = bin->size, rmax = 0;

	printf("%*sDUP %"PRIrec", SPLIT array\n", level*4, "", bin_dup->rec);

	bin->flags |= BIN_RANGE_UPDATED;
	bin_dup->flags |= BIN_RANGE_UPDATED;

	/* The right-hand bin starts with a fresh, empty range array. */
	bin_dup->rng = ArrayCreate(sizeof(range_t), 0);
	bin_dup->rng_free = -1;

	/* Pass 1 - hash sequences */
	n = ArrayMax(bin->rng);
	for (i = 0; i < n; i++) {
	    range_t *r = arrp(range_t, bin->rng, i);
	    int cstart; /* clipped sequence positions */
	    seq_t *s;

	    if (r->flags & GRANGE_FLAG_UNUSED)
		continue;

	    if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISANNO)
		continue;

	    s = (seq_t *)cache_search(io, GT_Seq, r->rec);

	    /*
	     * Which end of the range is adjusted, and by which of the
	     * sequence's left/right values, depends on (s->len < 0) combined
	     * with the accumulated complement flag.
	     */
	    if ((s->len < 0) ^ complement) {
		cstart = NMAX(r->start, r->end) - (s->right-1);
	    } else {
		cstart = NMIN(r->start, r->end) + s->left-1;
	    }

	    /* 1 => goes to the right bin, 0 => stays in the left bin. */
	    if (cstart >= pos) {
		HacheData hd;
		hd.i = 1;
		HacheTableAdd(h, (char *)&r->rec, sizeof(r->rec), hd, NULL);
	    } else {
		HacheData hd;
		hd.i = 0;
		HacheTableAdd(h, (char *)&r->rec, sizeof(r->rec), hd, NULL);
	    }
	}

	/* Pass 2 - do the moving of anno/seqs */
	n = ArrayMax(bin->rng);
	/* i reads every item; j is the write index for items kept on the left. */
	for (i = j = 0; i < n; i++) {
	    range_t *r = arrp(range_t, bin->rng, i), *r2;
	    int cstart; /* clipped sequence positions */

	    if (r->flags & GRANGE_FLAG_UNUSED)
		continue;

	    if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISANNO) {
		cstart = NMAX(r->start, r->end);
	    } else {
		seq_t *s = (seq_t *)cache_search(io, GT_Seq, r->rec);
		if ((s->len < 0) ^ complement) {
		    cstart = NMAX(r->start, r->end) - (s->right-1);
		} else {
		    cstart = NMIN(r->start, r->end) + s->left-1;
		}
	    }

	    if (cstart >= pos &&
		((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISANNO)) {
		anno_ele_t *a = (anno_ele_t *)cache_search(io,
							   GT_AnnoEle,
							   r->rec);
		/* If it's an annotation on a sequence < pos then we
		 * still don't move.
		 *
		 * FIXME: we have no guarantee that the sequence being
		 * annotated is in the same bin as this annotation, as
		 * they may be different sizes and end up in different
		 * bins. (Should we enforce anno always in same bin as seq?
		 * If so, consensus annos fit anywhere?)
		 */
		if (a->obj_type == GT_Seq) {
		    HacheItem *hi = HacheTableSearch(h,
						     (char *)&r->pair_rec,
						     sizeof(r->pair_rec));

		    if (hi) {
			/* Annotated sequence stayed left: keep the anno left. */
			if (hi->data.i == 0)
			    cstart = pos-1;
		    } else {
			puts("FIXME: annotation for seq in unknown place - "
			     "work out correct location and move if needed.");
		    }
		}
	    }

	    if (cstart >= pos) {
		/* Append a copy of the item to the right-hand array. */
		r2 = (range_t *)ArrayRef(bin_dup->rng, ArrayMax(bin_dup->rng));
		*r2 = *r;
		if (rmin > r->start) rmin = r->start;
		if (rmin > r->end) rmin = r->end;
		if (rmax < r->start) rmax = r->start;
		if (rmax < r->end) rmax = r->end;
		if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISSEQ)
		    nr++;
	    } else {
		if (lmin > r->start) lmin = r->start;
		if (lmin > r->end) lmin = r->end;
		if (lmax < r->start) lmax = r->start;
		if (lmax < r->end) lmax = r->end;

		/* Compact the left-hand array in place. */
		if (j != i) {
		    r2 = arrp(range_t, bin->rng, j);
		    *r2 = *r;
		}
		j++;
		if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISSEQ)
		    nl++;
	    }
	}
	bin_incr_nseq(io, bin, nl);
	bin_incr_nseq(io, bin_dup, nr);

	/* Truncate the left-hand array to the items that were kept. */
	ArrayMax(bin->rng) = j;

#if 0
	/*
	 * Right now this causes problems, but I'm not sure why. Try again
	 * after we've fixed the bin->nseqs issues and other deallocation
	 * woes.
	 */
	if (ArrayMax(bin_dup->rng) == 0 && bin_dup->parent_type == GT_Bin) {
	    /* We didn't need it afterall! Odd. */
	    bin_index_t *pb;

	    printf("Purging bin %d that we didn't need afterall\n",
		   bin_dup->rec);
	    cache_rec_deallocate(io, GT_Bin, bin_dup->rec);
	    pb = cache_search(io, GT_Bin, bin_dup->parent);
	    if (pb->child[0] == bin_dup->rec)
		pb->child[0] = 0;
	    if (pb->child[1] == bin_dup->rec)
		pb->child[1] = 0;
	    bin_dup = NULL;
	    pright = opright;
	}
#endif

	if (bin_dup)
	    break_contig_reparent_seqs(io, bin_dup);

	if (lmin < lmax) {
	    bin->start_used = lmin;
	    bin->end_used = lmax;
	} else {
	    /* No data left in bin */
	    bin->start_used = 0;
	    bin->end_used = 0;
	}

	printf("%*sLeft=>%d..%d right=>%d..%d\n", level*4, "",
	       lmin, lmax, rmin, rmax);

	if (bin_dup) {
	    if (rmin < rmax) {
		bin_dup->start_used = rmin;
		bin_dup->end_used = rmax;
	    } else {
		/* No data moved in bin */
		bin_dup->start_used = 0;
		bin_dup->end_used = 0;
	    }
	}
    }


    /*
     * Recurse into both children.  pleft/pright now name this bin and its
     * right-hand counterpart, which become the parents of the children.
     */
    for (i = 0; i < 2; i++) {
	bin_index_t *ch;
	if (!bin->child[i])
	    continue;

	ch = get_bin(io, bin->child[i]);
	if (0 != break_contig_recurse(io, h, cl, cr, bin->child[i], pos,
				      NMIN(ch->pos, ch->pos + ch->size-1),
				      level+1, pleft, pright,
				      i, complement))
	    return -1;
    }

    /* Release the reference taken by cache_incr() at the top. */
    cache_decr(io, bin);
    //    if (bin_dup)
    //	cache_decr(io, bin_dup);

    return 0;
}
int main(int argc, char **argv) { int n_tmor,n_nml,ncol,nbins; double *tumor, *normal; SRM_binning args; BIN_LIST bins = ll_new(); args = option_assign(argc,argv); if(args.inbin_file==NULL){ /*input is not binned data*/ tumor = read_table(args.in_tmor,&n_tmor,&ncol,-1,0); if(tumor==NULL) { fprintf(stderr,"Warning: the file %s is empty.\n",args.tumor_file);exit(1);} if(ncol!=1) {fprintf(stderr,"Error: tumor file has multiple columns.\n"); exit(1);} fprintf(stderr,"%d tumor reads loaded\n",n_tmor); normal = read_table(args.in_nml,&n_nml,&ncol,-1,0); if(normal==NULL) { fprintf(stderr,"Warning: the file %s is empty.\n",args.normal_file);exit(1);} if(ncol!=1) { fprintf(stderr,"Error: normal file has multiple columns.\n"); exit(1);} fprintf(stderr,"%d normal reads loaded\n",n_nml); if(args.fdr==1&&args.resampling==1){ qsort(tumor,n_tmor,sizeof(double),cmpdouble); fprintf(stderr,"sorted %d case reads\n",n_tmor); qsort(normal,n_nml,sizeof(double),cmpdouble); fprintf(stderr,"sorted %d control reads.\n",n_nml); check_read(tumor,n_tmor); check_read(normal,n_nml); fprintf(stderr,"Start resampling\n"); bic_seq_resample(tumor,n_tmor,normal,n_nml,args); free(tumor);tumor=NULL; free(normal);normal = NULL; } else{ fprintf(stderr,"Binning\n"); bins = sort_rms_binning(tumor,n_tmor,normal,n_nml,args.bin_size,&nbins,args.win_size,args.quantile,args.multple,args.outlier,args.tumor_file,args.normal_file); /*sort, remove singular positions and bin*/ if(args.binout_file!=NULL){ fprintf(stderr,"Reporting Binned data\n"); BIN_LIST_print(bins, args.outbin); } if(args.fdr!=1){ set_BinList(bins); fprintf(stderr,"Merging\n"); if(args.autoselect_lambda!=1){ bic_seq(args.paired); }else{ bic_seq_auto(ll_length(bins),args.FP, args.paired); } bins = get_BinList(); BIN_LIST_print(bins, args.output); ll_dealloc(bins); }else{ SEG_PERMUTE segs = NULL; segs = bic_seq_perm(bins, args.tumor_freq, args.FP,args.B ,args.autoselect_lambda); print_SEG_PERMUTE(segs,args.output); SEG_PERMUTE_destroy(segs); segs = 
NULL; } } }else{ double N_total=0.0, N_tumor=0.0,freq; int start, end, tumor, total; while (fscanf(args.inbin, "%d %d %lf %d %d", &tumor, &total, &freq, &start, &end) != EOF) { ll_append(bins, bin_new(tumor, total, freq, start, end)); N_total += total; N_tumor += tumor; } set_totalreadcount(N_tumor,N_total-N_tumor); if(args.fdr!=1){ set_BinList(bins); fprintf(stderr,"Merging\n"); if(args.autoselect_lambda!=1){ bic_seq(args.paired); //bic_seq(0); }else{ bic_seq_auto(ll_length(bins),args.FP, args.paired); //bic_seq_auto(ll_length(bins),args.FP, 0); } bins = get_BinList(); BIN_LIST_print(bins, args.output); ll_dealloc(bins); }else{ SEG_PERMUTE segs = NULL; segs = bic_seq_perm(bins, args.tumor_freq, args.FP,args.B ,args.autoselect_lambda); print_SEG_PERMUTE(segs,args.output); SEG_PERMUTE_destroy(segs); segs = NULL; } } return 0; }
/*
 * bic_seq_resample - build args.B resampled data sets and segment each one.
 *
 * tumor, n_tumor   tumor read positions and their number.
 * normal, n_nml    normal read positions and their number.
 * args             run-time options (bin size, tumor fraction, output, ...).
 *
 * The last element of each array is used as its largest position and the
 * first as its smallest, so both arrays are expected to be sorted ascending.
 * The random generators are seeded from the current time.  For every
 * replicate, tumor and normal sample sizes are drawn with rbinom(), the
 * reads are resampled into per-bin counts with random_sample(), non-empty
 * bins are appended to the replicate's list, and the list is segmented with
 * bic_seq() or bic_seq_auto().  All replicates are finally printed to
 * args.output and released.
 *
 * Exits with status 1 if the count buffers cannot be allocated.
 */
static void bic_seq_resample(double *tumor, int n_tumor, double *normal, int n_nml, SRM_binning args)
{
	SEG_PERMUTE segs = NULL;
	int *tumor_bin = NULL;
	int *normal_bin = NULL;
	int nbins, kmin;
	int rep, k;
	int n_tumor_sample, n_normal_sample;
	int total, start, end;
	double last_pos, first_pos;
	double freq, N_tumor, N_normal;
	struct timeval tv;
	int seed;

	/* Time based seed, shared by both random number generators. */
	gettimeofday(&tv, NULL);
	seed = tv.tv_sec * 1000000 + tv.tv_usec;
	seed_set(seed);
	srand48(seed);

	segs = SEG_PERMUTE_create(args.B);

	/* Size the count buffers from the largest position, never below 10. */
	if (tumor[n_tumor-1] > normal[n_nml-1]) {
		last_pos = tumor[n_tumor-1];
	} else {
		last_pos = normal[n_nml-1];
	}
	nbins = floor(last_pos/args.bin_size)+10;
	if (nbins <= 10) {
		nbins = 10;
	}

	tumor_bin = (int *) malloc(sizeof(int)*nbins);
	normal_bin = (int *) malloc(sizeof(int)*nbins);
	if (tumor_bin == NULL || normal_bin == NULL) {
		fprintf(stderr,"Error in bic_seq_resample: memory allocation failed\n");
		exit(1);
	}

	/* First bin index worth scanning: one below the smallest position's bin. */
	if (tumor[0] < normal[0]) {
		first_pos = tumor[0];
	} else {
		first_pos = normal[0];
	}
	kmin = (int) floor(first_pos/args.bin_size)-1;
	if (kmin <= 0) {
		kmin = 0;
	}

	for (rep = 0; rep < segs->size; rep++) {
		/* Both sample sizes are drawn before any resampling is done. */
		n_tumor_sample = rbinom(args.tumor_freq, n_tumor+n_nml);
		n_normal_sample = rbinom(1-args.tumor_freq, n_tumor+n_nml);

		random_sample(tumor, n_tumor, normal, n_nml, n_tumor_sample, args.bin_size,
			      tumor_bin, nbins, args.paired, args.insert, args.sd);
		random_sample(tumor, n_tumor, normal, n_nml, n_normal_sample, args.bin_size,
			      normal_bin, nbins, args.paired, args.insert, args.sd);

		N_tumor = 0.0;
		N_normal = 0.0;
		k = kmin;
		while (k < nbins) {
			start = k*args.bin_size+1;
			/*
			 * NOTE(review): end equals the next bin's start, so
			 * consecutive bins share one coordinate - confirm
			 * whether "start + bin_size - 1" was intended.
			 */
			end = start+args.bin_size;
			total = tumor_bin[k] + normal_bin[k];
			freq = ((double) tumor_bin[k])/((double) total);

			/* Only bins that received at least one read are recorded. */
			if (total > 0) {
				ll_append(segs->bins_perm[rep], bin_new(tumor_bin[k], total, freq, start, end));
			}

			N_tumor += tumor_bin[k];
			N_normal += normal_bin[k];
			k++;
		}

		set_BinList(segs->bins_perm[rep]);
		set_totalreadcount(N_tumor, N_normal);

		if (args.autoselect_lambda == 1) {
			bic_seq_auto(ll_length(segs->bins_perm[rep]), args.FP, args.paired);
		} else {
			bic_seq(args.paired);
		}

		/* Keep the segmented list in place of the raw bins. */
		segs->bins_perm[rep] = get_BinList();
	}

	print_SEG_PERMUTE(segs, args.output);
	SEG_PERMUTE_destroy(segs);
	segs = NULL;

	free(tumor_bin);
	tumor_bin = NULL;
	free(normal_bin);
	normal_bin = NULL;

	return;
}