// -------------------------------------------------------------------------- //
//
void ScalarDistributionData::init(const std::vector<int> & sampleset,
                                  const Library & library)
{
    // Remember how many samples the set holds; notify() uses it as the
    // normalization factor for single-sample updates.
    nsample_ = static_cast<int>(sampleset.size());

    // Histogram the scalar value of every sample in the set.
    for (std::vector<int>::const_iterator it = sampleset.begin();
         it != sampleset.end(); ++it)
    {
        const int library_index = *it;
        const double scalar     = library.get_scalar_at(library_index, pos_);

        // Count this sample in the bin its value falls into.
        const int slot = get_bin(scalar);
        distribution_[slot] += 1.0;
    }

    // Rescale the histogram so that its entries sum to one.
    const double total = vsum(distribution_);
    distribution_ = distribution_ / total;

    // Evaluate chi2 for the normalized distribution.
    chi2_ = calculate_chi2(distribution_);
}
Beispiel #2
0
 ZMQMESSAGE_DLL_PUBLIC
 inline
 void
 get(zmq::message_t& message, T& t,  bool binary_mode)
 {
   // Dispatch on the requested mode: get_bin() when binary_mode is set,
   // the two-argument get() overload otherwise.
   if (binary_mode)
   {
     get_bin(message, t);
   }
   else
   {
     get(message, t);
   }
 }
 // Two-argument get() overload that forwards to get_bin(message, t).
 //
 // The two defaulted pointer parameters exist only to constrain overload
 // selection; callers never pass them.
 // NOTE(review): presumably this overload is viable only when
 // Private::IsStr<T>::value is false and Private::IsRaw<T>::value is true -
 // confirm against the definitions of Private::DisableIf / Private::EnableIf.
 inline
 void
 get(zmq::message_t& message, T& t,
   typename Private::DisableIf<Private::IsStr<T>::value>::type* = 0,
   typename Private::EnableIf<Private::IsRaw<T>::value>::type* = 0)
 {
   get_bin(message, t);
 }
// -------------------------------------------------------------------------- //
//
// Writes the unweighted and weighted scalar distributions of the entries in
// 'weights_table', together with the scale and the reference (target)
// distribution, to the file "<basename>.Distribution-<name_>".
//
// library        Source of the scalar values (read at position pos_).
// weights_table  Entries providing the library index and its weight.
// basename       Prefix of the output file name.
//
// Both distributions are divided by the unweighted count, so only the
// unweighted one is guaranteed to sum to one.
void ScalarDistributionData::weighted_analysis(const Library & library,
                                               const std::vector<BasisContainer> & weights_table,
                                               const std::string & basename) const
{
    // Setup distributions.
    std::vector<double> distribution(distribution_.size(), 0.0);
    std::vector<double> weighted_distribution(distribution_.size(), 0.0);

    // Loop through the library set and extract the scalar value.
    for (size_t i = 0; i < weights_table.size(); ++i)
    {
        const int index     = weights_table[i].index;
        const double weight = weights_table[i].weight;
        const double value  = library.get_scalar_at(index, pos_);

        // Calculate which bin this value belongs to.
        const int bin = get_bin(value);

        // Increment the distribution at this bin.
        distribution[bin] += 1.0;
        weighted_distribution[bin] += 1.0 * weight;
    }

    // Normalize the distributions to 1.
    const double norm = vsum(distribution);
    distribution = distribution / norm;
    weighted_distribution = weighted_distribution / norm;

    // Print scale unweighted and weighted to file.
    std::string filename = basename + "." + "Distribution-" + name_;
    std::ofstream outfile(filename.c_str());

    // Check that the file is ok. A failed open sets failbit, not badbit,
    // so bad() would never report it; is_open() does.
    if (!outfile.is_open())
    {
        open_file_error(filename, LOCATION);
    }

    outfile << "        SCALE                DISTRIBUTION          WEIGHTED             REFERENCE" << std::endl;

    // Loop over bins.
    for (size_t i = 0; i < distribution.size(); ++i)
    {
        // Pull out the values.
        const double scale  = scale_[i];
        const double target = target_[i];
        const double value  = distribution[i];
        const double weighted_value = weighted_distribution[i];

        // "%20.10f" is a minimum width: a double of large magnitude needs up
        // to ~320 characters, so size the buffer for four of those and let
        // snprintf bound the write regardless.
        char line[1400];
        snprintf(line, sizeof(line), "%20.10f %20.10f  %20.10f %20.10f\n", scale, value, weighted_value, target);
        outfile << std::string(line);
    }
    outfile.close();
}
Beispiel #5
0
void hasher::add_scores(const cmf& hvals){
    for(size_t g = 0; g < tor_hashes.size(); g++){
        vector<vector<size_t> >& hvec = tor_hashes[g];
        for(size_t j = 0; j < (size_t)hvals.cols(); j++){
            for(size_t i = 0; i < (size_t)hvals.rows(); i++){
                //    increment corresponding hash table bin
                hvec[j][get_bin(hvals(i, j), hvec[j].size())]++;
            }
        }
    }
}
// -------------------------------------------------------------------------- //
//
void ScalarDistributionData::notify(const int from_sample,
                                    const int from_basis,
                                    const Library & library)
{
    // Scalar values of the entry leaving the sample set and of the one
    // replacing it.
    const double leaving_value  = library.get_scalar_at(from_sample, pos_);
    const double entering_value = library.get_scalar_at(from_basis, pos_);

    // Bins affected by the exchange.
    const int leaving_bin  = get_bin(leaving_value);
    const int entering_bin = get_bin(entering_value);

    // In the normalized distribution a single sample weighs 1/nsample.
    const double unit = (1.0 / nsample_);

    // Build the proposed distribution from the current one.
    distribution_new_ = distribution_;
    distribution_new_[entering_bin] += unit;
    distribution_new_[leaving_bin]  -= unit;

    // chi2 of the proposed distribution.
    chi2_new_ = calculate_chi2(distribution_new_);
}
Beispiel #7
0
void HashDataPage::release_pages_recursive(
  const memory::GlobalVolatilePageResolver& page_resolver,
  memory::PageReleaseBatch* batch) {
  // Pages linked after this one are released first, then the link is cleared.
  const bool has_next = next_page_.volatile_pointer_.components.offset != 0;
  if (has_next) {
    HashDataPage* next = reinterpret_cast<HashDataPage*>(
      page_resolver.resolve_offset(next_page_.volatile_pointer_));
    ASSERT_ND(next->header().get_in_layer_level() == 0);
    ASSERT_ND(next->get_bin() == get_bin());
    next->release_pages_recursive(page_resolver, batch);
    next_page_.volatile_pointer_.components.offset = 0;
  }

  // Finally hand this page itself to the release batch.
  VolatilePagePointer own_pointer;
  own_pointer.word = header().page_id_;
  batch->release(own_pointer);
}
Beispiel #8
0
/*
 * Looks for redundant bins at the root containing no data and just a single
 * child. Starting from the contig's root bin, each bin that has no range
 * array and at most one child is skipped by pointing the contig's root at
 * that child; the walk stops at the first bin that holds data or has two
 * children, or when no bin is left.
 *
 * FIXME: We need to compensate for bin position here. Hence this function
 * is not called for now.
 *
 * Returns 0 on success
 *        -1 on failure (contig not writable, or a bin could not be fetched)
 */
int remove_redundant_bins(GapIO *io, contig_t *c) {
    tg_rec bnum;

    if (!(c = cache_rw(io, c)))
	return -1;

    for (bnum = c->bin; bnum;) {
	bin_index_t *bin = get_bin(io, bnum);

	/* A bin we cannot fetch is a failure, not something to dereference */
	if (!bin)
	    return -1;

	if (bin->rng || (bin->child[0] && bin->child[1]))
	    break;

	/* Empty */
	c->bin = bin->child[0] ? bin->child[0] : bin->child[1];
	printf("Remove bin %"PRIrec"\n", bin->rec);
	bnum = c->bin;
    }

    return 0;
}
Beispiel #9
0
// Scores the random probe vectors against the hash tables.
//
// For every table g >= 2 the score is the fraction of rows of rand_scores
// for which no column hits an empty bin in table g-2. The result therefore
// has tor_hashes.size()-2 entries; it is empty when fewer than three tables
// exist. As a diagnostic, the first score (scaled by the number of random
// vectors) is printed next to a brute-force count of random vectors lying
// closer than 0.25 to any input vector.
vector<float> hasher::get_cur_scores(){
    // With fewer than three tables there is nothing to score. Returning here
    // also keeps size()-2 from wrapping around and avoids reading scores[0]
    // from an empty vector in the report at the end.
    if(tor_hashes.size() < 3){
        return vector<float>();
    }

    const rmf& hvals = rand_scores;
    vector<float> scores(tor_hashes.size()-2, 0);
    for(size_t g = 2; g < tor_hashes.size(); g++){
        vector<vector<size_t> >& hvec = tor_hashes[g];
        //preceding has table gives probability that distance is more than
        //current measure
        vector<vector<size_t> >& hvecm2 = tor_hashes[g-2];
        //gval hvec = dval[i]*2
        //gval hvecm1 = dval[i]
        //gval hvecm2 = dval[i]/2
        //if(!in hvecm1), pr bad is for 2*gval = dval*2
        //if(!in hvecm2), pr bad is for 2*gval = dval
        //if(in hvec) pr good is for gval/2 = dval
        //therefore need to be minus two hash funcs
        for(size_t i = 0; i < (size_t)hvals.rows(); i++){
            // Number of columns whose bin in the g-2 table is empty, i.e.
            // columns that claim the probe is not near any vector.
            // (A probability-based criterion used to be accumulated here as
            // well, but it was never consulted; only this count decides.)
            size_t num_far = 0;
            for(size_t j = 0; j < (size_t)hvals.cols(); j++){
                // The bin is derived from the size of the current table's
                // histogram and looked up in the g-2 table.
                const size_t far_count = hvecm2[j][get_bin(hvals(i, j), hvec[j].size())];
                if(far_count == 0){
                    num_far++;
                }
            }
            if(num_far < 1){
                scores[g-2]++;
            }
        }
        scores[g-2]/=rand_scores.rows();
    }

    // Brute-force reference: count random vectors within 0.25 of an input.
    size_t num_bel_min = 0;
    for(size_t r = 0; r < (size_t)rand_vecs.rows(); r++){
        float mindis=mnorm;
        list<rmf>::iterator lcur;
        for(lcur=in_vecs.begin(); lcur != in_vecs.end(); lcur++){
            for(size_t i = 0; i < (size_t)lcur->rows(); i++){
                float cdist = (rand_vecs.row(r) - lcur->row(i)).norm();
                if(cdist < mindis){
                    mindis = cdist;
                }
            }
        }
        if(mindis < 0.25){
            num_bel_min++;
        }
    }
    cout << scores[0]*rand_vecs.rows() << ", " << num_bel_min << endl;
    return scores;
}
Beispiel #10
0
Datei: erln8.c Projekt: mkb/erln8
int main(int argc, char* argv[]) {
  // compiler will whine about it being deprecated, but taking it out
  // blows things up
  // used for GIO
  g_type_init();
  g_log_set_always_fatal(G_LOG_LEVEL_ERROR);
  g_log_set_handler(G_LOG_DOMAIN, G_LOG_LEVEL_DEBUG |
                    G_LOG_LEVEL_ERROR |
                    G_LOG_LEVEL_WARNING |
                    G_LOG_LEVEL_MESSAGE |
                    G_LOG_FLAG_RECURSION |
                    G_LOG_FLAG_FATAL,  erln8_log, NULL);
  homedir = g_getenv("ERLN8_HOME");
  if(homedir == NULL) {
    homedir = g_get_home_dir();
  } else {
    // builds will fail if ERLN8_HOME is not an absolute path
    if(homedir[0] != '/') {
      g_error("ERLN8_HOME must be an absolute path\n");
    }
  }
  g_debug("home directory = %s\n", homedir);
  gchar* basename = g_path_get_basename(argv[0]);
  g_debug("basename = %s\n", basename);
  if((!strcmp(basename, "erln8")) || (!strcmp(basename, "./erln8"))) {
    erln8(argc, argv);
    g_free(basename);
  } else {
    gchar* erl = which_erlang();
    if(erl == NULL) {
      g_message("Can't find an " ERLN8_CONFIG_FILE " file to use\n");
      g_error("No " ERLN8_CONFIG_FILE " file\n");
    }
    GHashTable* erlangs = get_erlangs();
    GHashTable* runtime_options = get_erln8();
    gchar* path = g_hash_table_lookup(erlangs, erl);
    if(path == NULL) {
      g_hash_table_destroy(erlangs);
      g_hash_table_destroy(runtime_options);
      g_error("Version of Erlang (%s) isn't configured in erln8\n",
              erl);
    }
    gchar* use_color = (gchar*)g_hash_table_lookup(runtime_options, "color");
    if(g_strcmp0(use_color, "true") == 0) {
      opt_color = TRUE;
    } else {
      opt_color = FALSE;
    }
    gchar* use_banner = (gchar*)g_hash_table_lookup(runtime_options, "banner");
    if(g_strcmp0(use_banner, "true") == 0) {
      opt_banner = TRUE;
    } else {
      opt_banner = FALSE;
    }
    gchar* s = get_bin(erl, basename);
    g_debug("%s\n",s);
    gboolean result = g_file_test(s,
                                  G_FILE_TEST_EXISTS |
                                  G_FILE_TEST_IS_REGULAR);
    g_free(basename);
    if(!result) {
      g_hash_table_destroy(erlangs);
      g_hash_table_destroy(runtime_options);
      g_free(s);
      g_error("Can't run %s, check to see if the file exists\n", s);
    }
    if(opt_banner) {
      printf("%s", red());
      printf("erln8: %s", blue());
      printf("using Erlang %s", path);
      printf("%s\n", color_reset());
    }
    g_hash_table_destroy(erlangs);
    g_hash_table_destroy(runtime_options);
    // can't free s
    execv(s, argv);
  }
  return 0;
}
// Returns the skip probability stored in 'parameters' for the bin that
// get_bin() selects for the level pair (k_level1, k_level2).
double get_skip_probability(const KHMMParameters& parameters, double k_level1, double k_level2)
{
    size_t bin = get_bin(parameters, k_level1, k_level2);
    // The bin index must address an existing entry of the table.
    assert(bin < parameters.skip_probabilities.size());
    return parameters.skip_probabilities[bin];
}
// Re-estimates the model parameters from the collected training_data.
//
// Two things are learned:
//  - the transition probabilities trans_m_to_e_not_k and trans_e_to_e, from
//    the state transition counts;
//  - the per-bin skip probabilities, from the k-mer transition observations,
//    smoothed with pseudocounts taken from the current model.
//
// Diagnostics are written to stderr. If the transition counts needed as
// denominators are zero the function returns after printing and leaves all
// parameters unchanged.
void KHMMParameters::train()
{
    TrainingData& td = training_data;

    //
    // Profile HMM transitions
    //
    fprintf(stderr, "TRANSITIONS\n");

    size_t sum_m_not_k = get(td.state_transitions, statechar2index('M'), statechar2index('M')) + 
                         get(td.state_transitions, statechar2index('M'), statechar2index('E'));

    // Note: these ratios are computed (and printed) before the zero check
    // below, so the printed value may not be finite when there is no data.
    size_t me = get(td.state_transitions, statechar2index('M'), statechar2index('E'));
    double p_me_not_k = (double)me / sum_m_not_k;
    fprintf(stderr, "M->E|not_k: %lf\n", p_me_not_k);

    size_t sum_e = 0;
    for(int j = 0; j < td.state_transitions.n_cols; ++j) {
        sum_e += get(td.state_transitions, statechar2index('E'), j);
    }
    
    size_t ee = get(td.state_transitions, statechar2index('E'), statechar2index('E'));
    double p_ee = (double)ee / sum_e;
    fprintf(stderr, "E->E: %lf\n", p_ee);

    // Dump the full transition count matrix, one row per state.
    for(int i = 0; i < td.state_transitions.n_rows; ++i) {
        fprintf(stderr, "\t%c: ", "MEK"[i]);
        for(int j = 0; j < td.state_transitions.n_cols; ++j) {
            fprintf(stderr, "%d ", get(td.state_transitions, i, j));
        }
        fprintf(stderr, "\n");
    }

    if(sum_e == 0 || sum_m_not_k == 0) {
        // insufficient data to train, use defaults
        return;
    }

    trans_m_to_e_not_k = p_me_not_k;
    trans_e_to_e = p_ee;

    //
    // Signal-dependent skip probability
    //

    // Initialize observations with pseudocounts from the current model
    size_t num_bins = skip_probabilities.size();
    uint32_t pseudocount = 100;
    std::vector<double> total_observations(num_bins, 0.0f);
    std::vector<double> skip_observations(num_bins, 0.0f);


    for(size_t bin = 0; bin < num_bins; bin++) {
        skip_observations[bin] = skip_probabilities[bin] * pseudocount;
        total_observations[bin] = pseudocount;
    }

    for(size_t oi = 0; oi < td.kmer_transitions.size(); ++oi) {
        const KmerTransitionObservation& to = td.kmer_transitions[oi];
        bool is_skip = to.state == 'K';
        size_t bin = get_bin(*this, to.level_1, to.level_2);

        // Same guard as get_skip_probability(): the bin is used to write
        // into the observation vectors, so it must be in range.
        assert(bin < num_bins);

        skip_observations[bin] += is_skip;
        total_observations[bin] += 1;
    }

    // Update probabilities
    for(size_t bin = 0; bin < num_bins; bin++) {
        skip_probabilities[bin] = skip_observations[bin] / total_observations[bin];
        fprintf(stderr, "SKIPLEARN -- bin[%zu] %.3lf %.3lf %.3lf\n", bin, skip_observations[bin], total_observations[bin], skip_probabilities[bin]);
    }
}
Beispiel #13
0
 inline
 void
 get(zmq::message_t& message, T& t,  bool binary_mode)
 {
   // Dispatch on the requested mode: get_bin() when binary_mode is set,
   // the two-argument get() overload otherwise.
   if (binary_mode)
   {
     get_bin(message, t);
   }
   else
   {
     get(message, t);
   }
 }
Beispiel #14
0
/*
 * Re-links 'bin' beneath a new parent and unlinks it from its old one.
 *
 * bin       The bin being moved.
 * cfrom     Contig the bin is moved out of.
 * pfrom     Record of the old parent: cfrom->rec if the old parent is the
 *           contig itself, a bin record (> 0) if it is a bin, or a value
 *           <= 0 to skip the unlink step.
 * cto       Contig the bin is moved into.
 * pto       Record of the new parent: cto->rec if the new parent is the
 *           contig itself, otherwise a bin record.
 * child_no  Child slot of the new parent bin that receives 'bin' (only used
 *           when the new parent is a bin).
 *
 * Returns 0 on success
 *        -1 on failure (parent bin not available/writable, or 'pfrom' does
 *           not actually reference 'bin')
 */
static int break_contig_move_bin(GapIO *io, bin_index_t *bin,
				 contig_t *cfrom, tg_rec pfrom,
				 contig_t *cto,   tg_rec pto,
				 int child_no) {
    /* Add to */
    if (pto == cto->rec) {
	/* Parent is a contig */
	/*
	 * NOTE(review): this deallocates a GT_Bin record using the contig's
	 * record number (cto->rec); it looks like the contig's previous root
	 * bin (cto->bin) may have been intended - confirm.
	 */
	if (bin->rec != cto->bin) {
	    cache_rec_deallocate(io, GT_Bin, cto->rec);
	}
	/* 'bin' becomes the root bin; the contig spans 1..bin->size */
	cto->bin = bin->rec;
	cto->start = 1;
	cto->end = bin->size;

	bin->parent = cto->rec;
	bin->parent_type = GT_Contig;
	bin->flags |= BIN_BIN_UPDATED;

    } else {
	/* Parent is a bin */
	bin_index_t *pb;

	if (!(pb = get_bin(io, pto)))
	    return -1;
	if (!(pb = cache_rw(io, pb)))
	    return -1;

	pb->child[child_no] = bin->rec;
	pb->flags |= BIN_BIN_UPDATED;

	bin->parent = pto;
	bin->parent_type = GT_Bin;
	bin->flags |= BIN_BIN_UPDATED;
    }

    /* Remove from: NB it may not exist? */
    if (pfrom == cfrom->rec) {
	/* Parent is a contig */
	if (cfrom->bin != bin->rec) {
	    fprintf(stderr, "pfrom incorrect\n");
	    return -1;
	}

	cfrom->bin = 0;
    } else if (pfrom > 0) {
	/* Parent is a bin */
	bin_index_t *pb;

	if (!(pb = get_bin(io, pfrom)))
	    return -1;
	if (!(pb = cache_rw(io, pb)))
	    return -1;

	/* The old parent must reference 'bin' through one of its children */
	if (pb->child[0] != bin->rec && pb->child[1] != bin->rec) {
	    fprintf(stderr, "pfrom incorrect\n");
	    return -1;
	}

	/* NOTE(review): pb was already passed through cache_rw above */
	if (!(pb = cache_rw(io, pb)))
	    return -1;
	
	/* Clear whichever child slot pointed at 'bin' */
	if (pb->child[0] == bin->rec)
	    pb->child[0] = 0;
	else
	    pb->child[1] = 0;
	pb->flags |= BIN_BIN_UPDATED;
    }

    return 0;
}
Beispiel #15
0
/*
 * Toggles the BIN_COMPLEMENTED flag of bin record 'bnum'.
 *
 * The bin is made writable with cache_rw() and marked BIN_BIN_UPDATED, as
 * the other bin-modifying code in this file does. Nothing happens if the
 * bin cannot be fetched or made writable.
 */
static void complement_bin(GapIO *io, tg_rec bnum) {
    bin_index_t *bin = get_bin(io, bnum);

    if (!bin || !(bin = cache_rw(io, bin)))
	return;

    bin->flags ^= BIN_COMPLEMENTED;
    bin->flags |= BIN_BIN_UPDATED;
}
Beispiel #16
0
/*
 * Breaks contig 'crec' in two at position 'cpos'.
 *
 * A new contig is created for the right-hand side. Its name is the name of
 * the original contig followed by "#<n>", where <n> counts up from 1 until
 * contig_index_query() no longer returns a value > 0 for the name. The bin
 * tree is then distributed between the two contigs by
 * break_contig_recurse(), the contig extents are recomputed, and a contig
 * that ends up without a root bin is destroyed.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int break_contig(GapIO *io, tg_rec crec, int cpos) {
    contig_t *cl;
    contig_t *cr;
    int cid;
    char cname[1024], *cname_end;
    int left_end, right_start;
    bin_index_t *bin;
    int do_comp = 0;
    HacheTable *h;

    if (!(cl = (contig_t *)cache_search(io, GT_Contig, crec)))
	return -1;

    //contig_dump_ps(io, &cl, "/tmp/tree.ps");

    /*
     * Our hash table is keyed on sequence record numbers for all sequences
     * in all bins spanning the break point. The value is either 0 or 1
     * for left/right contig.
     * 
     * The purpose of this hash is to allow us to work out whether a tag
     * belongs in the left or right contig, as a tag could start beyond the
     * break point but be attached to a sequence before the break point.
     *
     * Further complicating this is that a tag could be in a smaller bin
     * than the sequence as it may not be as long. However we know
     * we'll recurse down these in a logical order so we can be sure
     * we've already "seen" the sequence that the tag has been
     * attached to.
     */
    if (!(h = HacheTableCreate(1024, HASH_DYNAMIC_SIZE)))
	return -1;

    /*
     * strncpy() does not terminate the copy when the source is 1000 chars
     * or longer, so terminate explicitly. This leaves room in cname for
     * the "#<n>" suffix appended below.
     */
    strncpy(cname, contig_get_name(&cl), 1000);
    cname[1000] = '\0';
    cname_end = cname + strlen(cname);
    cid = 1;
    do {
	sprintf(cname_end, "#%d", cid++);
    } while (contig_index_query(io, cname) > 0);

    if (!(cr = contig_new(io, cname))) {
	HacheTableDestroy(h, 0);
	return -1;
    }
    cl = cache_rw(io, cl);
    cr = cache_rw(io, cr);
    if (!cl || !cr) {
	HacheTableDestroy(h, 0);
	return -1;
    }
    if (0 != contig_index_update(io, cname, strlen(cname), cr->rec)) {
	HacheTableDestroy(h, 0);
	return -1;
    }
    printf("Break in contig %"PRIrec", pos %d\n", crec, cpos);

    printf("Existing left bin = %"PRIrec", right bin = %"PRIrec"\n",
	   cl->bin, cr->bin);

    /* Keep both contigs referenced until we are done with them */
    cache_incr(io, cl);
    cache_incr(io, cr);

    /* Remember the orientation of the original root bin */
    if (!(bin = get_bin(io, cl->bin))) {
	cache_decr(io, cl);
	cache_decr(io, cr);
	HacheTableDestroy(h, 0);
	return -1;
    }
    do_comp = bin->flags & BIN_COMPLEMENTED;

    break_contig_recurse(io, h, cl, cr,
			 contig_get_bin(&cl), cpos, contig_offset(io, &cl),
			 0, cl->rec, cr->rec, 0, 0);

    /* Recompute end positions */
    left_end    = contig_visible_end(io, cl->rec);
    right_start = contig_visible_start(io, cr->rec);

    /* Ensure start/end positions of contigs work out */
    bin = get_bin(io, cr->bin);
    if (!bin || !(bin = cache_rw(io, bin))) {
	cache_decr(io, cl);
	cache_decr(io, cr);
	HacheTableDestroy(h, 0);
	return -1;
    }

    //#define KEEP_POSITIONS 1
#ifndef KEEP_POSITIONS
    cr->start = 1;
    cr->end = cl->end - right_start + 1;
    bin->pos -= right_start-1;
#else
    cr->start = right_start;
    cr->end = cl->end;
#endif

    /* Give the right contig's root bin the orientation the original had */
    if ((do_comp && !(bin->flags & BIN_COMPLEMENTED)) ||
	(!do_comp && (bin->flags & BIN_COMPLEMENTED))) {
	bin->flags ^= BIN_COMPLEMENTED;
    }

    cl->end = left_end;

    //    remove_redundant_bins(io, cl);
    //    remove_redundant_bins(io, cr);

    printf("Final left bin = %"PRIrec", right bin = %"PRIrec"\n",
	   cl->bin, cr->bin);

    HacheTableDestroy(h, 0);

    //if (cl->bin) contig_dump_ps(io, &cl, "/tmp/tree_l.ps");
    //if (cr->bin) contig_dump_ps(io, &cr, "/tmp/tree_r.ps");

    cache_flush(io);

    remove_empty_bins(io, cl->rec);
    remove_empty_bins(io, cr->rec);

    /* Empty contig? If so remove it completely */
    if (cl->bin == 0) {
	printf("Removing empty contig %"PRIrec"\n", cl->rec);
	contig_destroy(io, cl->rec);
    }
    if (cr->bin == 0) {
	printf("Removing empty contig %"PRIrec"\n", cr->rec);
	contig_destroy(io, cr->rec);
    }

    cache_decr(io, cl);
    cache_decr(io, cr);

    cache_flush(io);

    return 0;
}
Beispiel #17
0
/*
 * A recursive break contig function.
 *
 * Visits the bin 'bin_num' and, if it straddles the break point, its
 * children. A bin starting at or beyond 'pos' is moved to the right contig,
 * a bin ending before 'pos' is left where it is, and a bin overlapping
 * 'pos' gets a counterpart in the right contig with its range array moved,
 * kept or split between the two.
 *
 * io		The database handle.
 * h		Hash keyed on sequence record number; the value records
 *		whether the sequence went left (0) or right (1). Filled in
 *		here and consulted when placing annotations.
 * cl		The left (original) contig.
 * cr		The right (new) contig.
 * bin_num	The current bin being moved or split.
 * pos		The contig break point.
 * offset	The absolute positional offset of this bin in original contig
 * level	Recursion depth; only used to indent the diagnostic output.
 * pleft	The parent bin/contig record num in the left new contig
 * pright	The parent bin/contig record num in the right new contig
 * child_no     0 or 1 - whether this bin is the left/right child of its parent
 * complement	Accumulated orientation; toggled for each bin on the path
 *		that has BIN_COMPLEMENTED set.
 *
 * Returns 0 on success
 *        -1 on failure
 *
 * NOTE(review): the "return -1" paths below leave without the cache_decr()
 * matching the cache_incr() on 'bin' - confirm whether that is intended.
 */
static int break_contig_recurse(GapIO *io, HacheTable *h,
				contig_t *cl, contig_t *cr,
				tg_rec bin_num, int pos, int offset,
				int level, tg_rec pleft, tg_rec pright,
				int child_no, int complement) {
    int i, j, f_a, f_b;
    tg_rec rbin;
    bin_index_t *bin = get_bin(io, bin_num), *bin_dup ;
    //int bin_min, bin_max;
    int nseqs;
    tg_rec opright; /* old pright, needed if we revert back */

    cache_incr(io, bin);

    if (bin->flags & BIN_COMPLEMENTED) {
	complement ^= 1;
    }

    /* NOTE(review): f_a and f_b are assigned here but not read again below */
    if (complement) {
	f_a = -1;
	f_b = offset + bin->size-1;
    } else {
	f_a = +1;
	f_b = offset;
    }

    printf("%*sBreak offset %d pos %d => test bin %"PRIrec": %d..%d\n",
	   level*4, "",
	   offset, pos, bin->rec,
	   NMIN(bin->start_used, bin->end_used),
	   NMAX(bin->start_used, bin->end_used));

    /*
     * Remember the sequence count and zero it; each branch below adds back
     * the count appropriate for the bin via bin_incr_nseq().
     */
    bin = cache_rw(io, bin);
    nseqs = bin->nseqs;
    bin->nseqs = 0;

    /* Invalidate any cached data */
    bin_invalidate_track(io, bin, TRACK_ALL);
    if (bin->flags & BIN_CONS_VALID) {
	bin->flags |= BIN_BIN_UPDATED;
	bin->flags &= ~BIN_CONS_VALID;
    }

    //bin_min = bin->rng ? NMIN(bin->start_used, bin->end_used) : offset;
    //bin_max = bin->rng ? NMAX(bin->start_used, bin->end_used) : offset;

    /*
     * Add to right parent if this bin is to the right of pos,
     * or if the used portion is to the right and we have no left child.
     *
     * FIXME: Not a valid assumption!
     * The used portion of a bin is not a placeholder for the used portion
     * of all the the children beneath it. Therefore if the used portion of
     * this bin is > pos (and we have no left child) it still doesn't mean
     * that the absolute positions of the used portion of the right child
     * won't be < pos.
     */
    if (offset >= pos /*|| (bin_min >= pos && !bin->child[0])*/) {
	printf("%*sADD_TO_RIGHT pl=%"PRIrec" pr=%"PRIrec"\n",
	       level*4, "", pleft, pright);
	if (0 != break_contig_move_bin(io, bin,
				       cl, pleft, cr, pright, 
				       child_no))
	    return -1;

	/* The whole subtree moves, so the original count still applies */
	bin_incr_nseq(io, bin, nseqs);
	cache_decr(io, bin);

	return 0;
    }

    /*
     * Add to left parent if this bin is entirely to the left of pos,
     * or if the used portion is to the left and we have no right child.
     */
    if (offset + bin->size < pos /*|| (bin_max < pos && !bin->child[1])*/) {
	printf("%*sADD_TO_LEFT\n", level*4, "");

	//if (0 != break_contig_move_bin(io, bin, cr, pright, cl, pleft, child_no))
	//return -1;

	/* The bin stays where it is; only the count is restored */
	bin_incr_nseq(io, bin, nseqs);
	cache_decr(io, bin);
	
	return 0;
    }

    /*
     * Nominally the bin overlaps both left and right and so needs duplicating.
     * There are cases though at the roots of our trees where duplicating is
     * unnecessary as it leads to empty bins at the root. In this case
     * we skip creating a duplicate for the right, or alternatively steal
     * the left root bin and use that instead.
     *
     * Similarly the range_t array will either be left where it is, moved to
     * the right contig, or split in half (creating a new one for the right).
     *
     * FIXED: always need this. Eg:
     *
     * |-------------empty--------------|
     * |----------------|---------------|
     * |--------|-------|--------|------|
     *             ^
     *             |
     *             break here
     *
     * In this case we need to duplicate the parent as it overlaps the left
     * bin, which may (or may not) have data that needs to end up in the right
     * hand contig. Just duplicate for now and free later on if needed.
     */
    /* The leading "1 ||" forces this branch; the else below is never taken */
    if (1 /* always! */ || pright != cr->rec ||
	(bin->rng && NMAX(bin->start_used, bin->end_used) >= pos)) {
	//printf("NMAX=%d >= %d\n", NMAX(bin->start_used, bin->end_used), pos);

	rbin = 0;

	/* Possibly steal left contig's bin */
	if (pleft == cl->rec && NMIN(bin->start_used, bin->end_used) >= pos) {
#if 0
	    /* Currently this doesn't always work */
	    if (bin->child[1]) {
		bin_index_t *ch = get_bin(io, bin->child[1]);
		if (NMIN(ch->pos, ch->pos + ch->size-1) >= pos) {
		    rbin = cl->bin;
		    cl->bin = bin->child[0];
		}
	    }
#else
	    pleft = bin->rec;
#endif
	} else {
	    pleft = bin->rec;
	}

	/* Create new bin, or use root of contig if it's unused so far */
	if (!rbin && pright == cr->rec) {
	    rbin = cr->bin;
	}

	/* Otherwise we genuingly need a duplicate */
	if (!rbin)
	    rbin = bin_new(io, 0, 0, 0, GT_Bin);

	/* Initialise with duplicate values from left bin */
	bin_dup = get_bin(io, rbin);
	bin_dup = cache_rw(io, bin_dup);
	bin_dup->size = bin->size;
	bin_dup->pos = bin->pos;
	bin_dup->parent = pright;
	bin_dup->parent_type = (pright == cr->rec ? GT_Contig : GT_Bin);
	bin_dup->flags = bin->flags | BIN_BIN_UPDATED;
	bin_dup->start_used = bin->start_used;
	bin_dup->end_used = bin->end_used;

	/*
	 * Shift bin to offset if it's the contig root.
	 * It'll be shifted back by the correct amount later.
	 */
	if (pright == cr->rec) {
	    printf("moving root bin to offset=%d comp=%d\n", offset, complement);
	    bin_dup->pos = offset;
	}

	printf("%*sCreated dup for right, rec %"PRIrec"\n",
	       level*4,"", bin_dup->rec);
	/*
	 * Link the duplicate beneath the right-hand parent. pfrom is 0, so
	 * nothing is unlinked on the left.
	 * NOTE(review): the return value is not checked here.
	 */
	break_contig_move_bin(io, bin_dup, cl, 0, cr, pright, child_no);
	/* opright is only read inside the disabled "#if 0" block further down */
	opright = pright;
	pright = bin_dup->rec;
    } else {
	bin_dup = NULL;
	pleft = bin->rec;
    }

    /*
     * Decide what happens to this bin's range array: nothing (empty), move
     * it all to the right, keep it all on the left, or split it.
     */
    if (!bin->rng) {
	/* Empty bin */
	printf("%*sEMPTY range\n", level*4, "");
	bin->start_used = bin->end_used = 0;
	bin->flags |= BIN_BIN_UPDATED;
	if (bin_dup) {
	    bin_dup->start_used = bin_dup->end_used = 0;
	    bin_dup->flags |= BIN_BIN_UPDATED;
	}
	    
    } else if (NMIN(bin->start_used, bin->end_used) >= pos) {
	/* Move range to right contig */
	printf("%*sDUP %"PRIrec", MOVE Array to right\n",
	       level*4, "", bin_dup->rec);

	bin_dup->rng = bin->rng;
	bin_dup->rng_rec = bin->rng_rec;
	bin_dup->rng_free = bin->rng_free;
	if (bin_dup->rng_rec)
	    bin_dup->flags |= BIN_RANGE_UPDATED;

	/* Detach the array from the left bin unless both are the same bin */
	if (bin->rec != bin_dup->rec) {
	    bin->rng = NULL;
	    bin->rng_rec = 0;
	    bin->rng_free = -1;
	    bin->flags |= BIN_BIN_UPDATED;
	}

	bin->start_used = bin->end_used = 0;
	break_contig_reparent_seqs(io, bin_dup);

	/* Record every non-annotation item as "right" (1) and count them */
	if (bin_dup->rng) {
	    int n = ArrayMax(bin_dup->rng);
	    for (i = j = 0; i < n; i++) {
		range_t *r = arrp(range_t, bin_dup->rng, i), *r2;
		if (r->flags & GRANGE_FLAG_UNUSED)
		    continue;

		if ((r->flags & GRANGE_FLAG_ISMASK) != GRANGE_FLAG_ISANNO) {
		    HacheData hd; hd.i = 1;
		    HacheTableAdd(h, (char *)&r->rec, sizeof(r->rec), hd,NULL);
		    j++;
		}
	    }
	    bin_incr_nseq(io, bin_dup, j);
	}
    } else if (NMAX(bin->start_used, bin->end_used) < pos) {
	/* Range array already in left contig, so do nothing */
	printf("%*sMOVE Array to left\n", level*4, "");

	if (bin_dup)
	    bin_dup->start_used = bin_dup->end_used = 0;

	/* Record every non-annotation item as "left" (0) and count them */
	if (bin->rng) {
	    int n = ArrayMax(bin->rng);
	    for (i = j = 0; i < n; i++) {
		range_t *r = arrp(range_t, bin->rng, i);
		if (r->flags & GRANGE_FLAG_UNUSED)
		    continue;

		if ((r->flags & GRANGE_FLAG_ISMASK) != GRANGE_FLAG_ISANNO) {
		    HacheData hd; hd.i = 0;
		    HacheTableAdd(h, (char *)&r->rec, sizeof(r->rec), hd,NULL);
		    j++;
		}
	    }
	    bin_incr_nseq(io, bin, j);
	}
    } else {
	/* Range array covers pos, so split in two */
	int n, nl = 0, nr = 0;
	/* Running min/max of the item coordinates kept left / moved right */
	int lmin = bin->size, lmax = 0, rmin = bin->size, rmax = 0;

	printf("%*sDUP %"PRIrec", SPLIT array\n", level*4, "", bin_dup->rec);

	bin->flags |= BIN_RANGE_UPDATED;
	bin_dup->flags |= BIN_RANGE_UPDATED;

	bin_dup->rng = ArrayCreate(sizeof(range_t), 0);
	bin_dup->rng_free = -1;

	/* Pass 1 - hash sequences */
	n = ArrayMax(bin->rng);
	for (i = 0; i < n; i++) {
	    range_t *r = arrp(range_t, bin->rng, i);
	    int cstart; /* clipped sequence positions */
	    seq_t *s;

	    if (r->flags & GRANGE_FLAG_UNUSED)
		continue;

	    if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISANNO)
		continue;

	    s = (seq_t *)cache_search(io, GT_Seq, r->rec);
	    if ((s->len < 0) ^ complement) {
		cstart = NMAX(r->start, r->end) - (s->right-1);
	    } else {
		cstart = NMIN(r->start, r->end) + s->left-1;
	    }
	    
	    /* Clipped start at/after pos => right (1), otherwise left (0) */
	    if (cstart >= pos)  {
		HacheData hd; hd.i = 1;
		HacheTableAdd(h, (char *)&r->rec, sizeof(r->rec), hd, NULL);
	    } else {
		HacheData hd; hd.i = 0;
		HacheTableAdd(h, (char *)&r->rec, sizeof(r->rec), hd, NULL);
	    }
	}
	
	/* Pass 2 - do the moving of anno/seqs */
	n = ArrayMax(bin->rng);
	for (i = j = 0; i < n; i++) {
	    range_t *r = arrp(range_t, bin->rng, i), *r2;
	    int cstart; /* clipped sequence positions */

	    if (r->flags & GRANGE_FLAG_UNUSED)
		continue;

	    if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISANNO) {
		cstart = NMAX(r->start, r->end);
	    } else {
		seq_t *s = (seq_t *)cache_search(io, GT_Seq, r->rec);
		if ((s->len < 0) ^ complement) {
		    cstart = NMAX(r->start, r->end) - (s->right-1);
		} else {
		    cstart = NMIN(r->start, r->end) + s->left-1;
		}
	    }
	    
	    if (cstart >= pos &&
		((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISANNO)) {
		anno_ele_t *a = (anno_ele_t *)cache_search(io,
							   GT_AnnoEle,
							   r->rec);
		/* If it's an annotation on a sequence < pos then we
		 * still don't move.
		 *
		 * FIXME: we have no guarantee that the sequence being
		 * annotated is in the same bin as this annotation, as
		 * they may be different sizes and end up in different
		 * bins. (Should we enforce anno always in same bin as seq?
		 * If so, consensus annos fit anywhere?)
		 */
		if (a->obj_type == GT_Seq) {
		    HacheItem *hi = HacheTableSearch(h,
						     (char *)&r->pair_rec,
						     sizeof(r->pair_rec));

		    if (hi) {
			/* Sequence stayed left: force the tag to stay too */
			if (hi->data.i == 0)
			    cstart = pos-1;
		    } else {
			puts("FIXME: annotation for seq in unknown place - "
			     "work out correct location and move if needed.");
		    }
		}
	    }

	    if (cstart >= pos) {
		/* Append a copy to the right bin's array */
		r2 = (range_t *)ArrayRef(bin_dup->rng, ArrayMax(bin_dup->rng));
		*r2 = *r;
		if (rmin > r->start) rmin = r->start;
		if (rmin > r->end)   rmin = r->end;
		if (rmax < r->start) rmax = r->start;
		if (rmax < r->end)   rmax = r->end;
		if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISSEQ)
		    nr++;
	    } else {
		if (lmin > r->start) lmin = r->start;
		if (lmin > r->end)   lmin = r->end;
		if (lmax < r->start) lmax = r->start;
		if (lmax < r->end)   lmax = r->end;

		/* Compact the retained items towards the array start */
		if (j != i) {
		    r2 = arrp(range_t, bin->rng, j);
		    *r2 = *r;
		}
		j++;
		if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISSEQ)
		    nl++;
	    }
	}
	bin_incr_nseq(io, bin, nl);
	bin_incr_nseq(io, bin_dup, nr);


	/* Truncate the left array to the items that were kept */
	ArrayMax(bin->rng) = j;

#if 0
	/*
	 * Right now this causes problems, but I'm not sure why. Try again
	 * after we've fixed the bin->nseqs issues and other deallocation
	 * woes.
	 */

	if (ArrayMax(bin_dup->rng) == 0 && bin_dup->parent_type == GT_Bin) {
	    /* We didn't need it afterall! Odd. */
	    bin_index_t *pb;

	    printf("Purging bin %d that we didn't need afterall\n",
		   bin_dup->rec);
	    cache_rec_deallocate(io, GT_Bin, bin_dup->rec);
	    pb = cache_search(io, GT_Bin, bin_dup->parent);
	    if (pb->child[0] == bin_dup->rec)
		pb->child[0] = 0;
	    if (pb->child[1] == bin_dup->rec)
		pb->child[1] = 0;
	    bin_dup = NULL;
	    pright = opright;
	}
#endif

	if (bin_dup)
	    break_contig_reparent_seqs(io, bin_dup);

	if (lmin < lmax) {
	    bin->start_used     = lmin;
	    bin->end_used       = lmax;
	} else {
	    /* No data left in bin */
	    bin->start_used = 0;
	    bin->end_used = 0;
	}

	printf("%*sLeft=>%d..%d right=>%d..%d\n", level*4, "",
	       lmin, lmax, rmin, rmax);

	if (bin_dup) {
	    if (rmin < rmax) {
		bin_dup->start_used = rmin;
		bin_dup->end_used   = rmax;
	    } else {
		/* No data moved in bin */
		bin_dup->start_used = 0;
		bin_dup->end_used   = 0;
	    }
	}
    }


    /* Recurse */
    for (i = 0; i < 2; i++) {
	bin_index_t *ch;
	if (!bin->child[i])
	    continue;

	/* The child's offset is the smaller of its two end coordinates */
	ch = get_bin(io, bin->child[i]);
	if (0 != break_contig_recurse(io, h, cl, cr, bin->child[i], pos,
				      NMIN(ch->pos, ch->pos + ch->size-1),
				      level+1, pleft, pright,
				      i, complement))
	    return -1;
    }

    cache_decr(io, bin);
    //    if (bin_dup)
    //	cache_decr(io, bin_dup);

    return 0;
}