// -------------------------------------------------------------------------- // // void ScalarDistributionData::init(const std::vector<int> & sampleset, const Library & library) { // Get the number of elements in the sample set. nsample_ = static_cast<int>(sampleset.size()); // Loop through the sample set and extract the scalar value. for (size_t i = 0; i < sampleset.size(); ++i) { const int index = sampleset[i]; const double value = library.get_scalar_at(index, pos_); // Calculate which bin this value belongs to. const int bin = get_bin(value); // Increment the distribution at this bin. distribution_[bin] += 1.0; } // Normalize the distribution to 1. const double norm = vsum(distribution_); distribution_ = distribution_ / norm; // Calculate chi2. chi2_ = calculate_chi2(distribution_); }
// Extracts `t` from `message`, choosing between the binary reader
// (get_bin) and the two-argument get() overload according to binary_mode.
ZMQMESSAGE_DLL_PUBLIC inline void
get(zmq::message_t& message, T& t, bool binary_mode)
{
  if (binary_mode)
  {
    get_bin(message, t);
  }
  else
  {
    get(message, t);
  }
}
// Forwards (message, t) to get_bin().
//
// The two unnamed, defaulted pointer parameters are never used in the body;
// they only take part in the signature. NOTE(review): presumably they are
// SFINAE constraints selecting this overload for types where
// Private::IsStr<T>::value is false and Private::IsRaw<T>::value is true --
// confirm against the definitions of Private::DisableIf / Private::EnableIf.
inline void
get(zmq::message_t& message, T& t,
    typename Private::DisableIf<Private::IsStr<T>::value>::type* = 0,
    typename Private::EnableIf<Private::IsRaw<T>::value>::type* = 0)
{
  get_bin(message, t);
}
// -------------------------------------------------------------------------- // // void ScalarDistributionData::weighted_analysis(const Library & library, const std::vector<BasisContainer> & weights_table, const std::string & basename) const { // Setup distributions. std::vector<double> distribution(distribution_.size(), 0.0); std::vector<double> weighted_distribution(distribution_.size(), 0.0); // Loop through the library set and extract the scalar value. for (size_t i = 0; i < weights_table.size(); ++i) { const int index = weights_table[i].index; const double weight = weights_table[i].weight; const double value = library.get_scalar_at(index, pos_); // Calculate which bin this value belongs to. const int bin = get_bin(value); // Increment the distribution at this bin. distribution[bin] += 1.0; weighted_distribution[bin] += 1.0 * weight; } // Normalize the distributions to 1. const double norm = vsum(distribution); distribution = distribution / norm; weighted_distribution = weighted_distribution / norm; // Print scale unweighted and weighted to file. std::string filename = basename + "." + "Distribution-" + name_; std::ofstream outfile(filename.c_str()); // Check that the file is ok. if (outfile.bad()) { open_file_error(filename, LOCATION); } outfile << " SCALE DISTRIBUTION WEIGHTED REFERENCE" << std::endl; // Loop over bins. for (size_t i = 0; i < distribution.size(); ++i) { // Pull out the values. const double scale = scale_[i]; const double target = target_[i]; const double value = distribution[i]; const double weighted_value = weighted_distribution[i]; char line[300]; sprintf(line, "%20.10f %20.10f %20.10f %20.10f\n", scale, value, weighted_value, target); outfile << std::string(line); } outfile.close(); }
void hasher::add_scores(const cmf& hvals){ for(size_t g = 0; g < tor_hashes.size(); g++){ vector<vector<size_t> >& hvec = tor_hashes[g]; for(size_t j = 0; j < (size_t)hvals.cols(); j++){ for(size_t i = 0; i < (size_t)hvals.rows(); i++){ // increment corresponding hash table bin hvec[j][get_bin(hvals(i, j), hvec[j].size())]++; } } } }
// -------------------------------------------------------------------------- // // void ScalarDistributionData::notify(const int from_sample, const int from_basis, const Library & library) { // Determine the new value. const double subtr_val = library.get_scalar_at(from_sample, pos_); const double add_val = library.get_scalar_at(from_basis, pos_); // Calculate which bin these values belong to. const int subtr_bin = get_bin(subtr_val); const int add_bin = get_bin(add_val); // Copy the distribution over. distribution_new_ = distribution_; // Add and subtract. (Division by nsample here to get correct normalization) distribution_new_[add_bin] += (1.0 / nsample_); distribution_new_[subtr_bin] -= (1.0 / nsample_); // Determine the new chi2. chi2_new_ = calculate_chi2(distribution_new_); }
// Releases this page and every page reachable through next_page_ into
// `batch`. Pages further down the chain are released before this one, and
// the link to the next page is cleared once it has been released.
void HashDataPage::release_pages_recursive(
  const memory::GlobalVolatilePageResolver& page_resolver,
  memory::PageReleaseBatch* batch) {
  const bool has_successor = (next_page_.volatile_pointer_.components.offset != 0);
  if (has_successor) {
    HashDataPage* successor = reinterpret_cast<HashDataPage*>(
      page_resolver.resolve_offset(next_page_.volatile_pointer_));
    ASSERT_ND(successor->header().get_in_layer_level() == 0);
    ASSERT_ND(successor->get_bin() == get_bin());
    successor->release_pages_recursive(page_resolver, batch);
    next_page_.volatile_pointer_.components.offset = 0;
  }

  // Finally hand this page's own ID to the batch.
  VolatilePagePointer self_pointer;
  self_pointer.word = header().page_id_;
  batch->release(self_pointer);
}
/*
 * Looks for redundant bins at the root containing no data and just a single
 * child. Each such bin is skipped by pointing the contig's root (c->bin) at
 * its only child; the walk stops at the first bin that holds a range array
 * or has two children, or when no bin is left.
 *
 * FIXME: We need to compensate for bin position here. Hence this function
 * is not called for now.
 *
 * NOTE(review): the skipped bins are neither deallocated nor is the new
 * root's parent updated here - confirm whether callers are expected to do so.
 *
 * Returns 0 on success
 *        -1 on failure (contig could not be made writable, or a bin could
 *           not be loaded)
 */
int remove_redundant_bins(GapIO *io, contig_t *c) {
    tg_rec bnum;

    if (!(c = cache_rw(io, c)))
        return -1;

    for (bnum = c->bin; bnum;) {
        bin_index_t *bin = get_bin(io, bnum);

        /* get_bin() can fail; do not dereference a NULL bin */
        if (!bin)
            return -1;

        if (bin->rng || (bin->child[0] && bin->child[1]))
            break;

        /* Empty */
        c->bin = bin->child[0] ? bin->child[0] : bin->child[1];
        printf("Remove bin %"PRIrec"\n", bin->rec);
        bnum = c->bin;
    }

    return 0;
}
// Scores the random vectors (rand_scores) against the hash tables.
//
// For every table group g >= 2, scores[g-2] is the fraction of rows of
// rand_scores for which no column maps to an empty bin in group g-2.
// As a diagnostic, the function also prints scores[0]*rand_vecs.rows()
// together with the number of random vectors whose nearest input vector is
// closer than 0.25.
//
// Returns an empty vector when there are fewer than two table groups
// (previously the size computation underflowed in that case).
vector<float> hasher::get_cur_scores(){
    // tor_hashes.size()-2 is unsigned: guard against wrap-around.
    if(tor_hashes.size() < 2){
        return vector<float>();
    }

    const rmf& hvals = rand_scores;
    vector<float> scores(tor_hashes.size()-2, 0);

    for(size_t g = 2; g < tor_hashes.size(); g++){
        vector<vector<size_t> >& hvec = tor_hashes[g];
        //preceding hash table gives probability that distance is more than
        //current measure
        vector<vector<size_t> >& hvecm2 = tor_hashes[g-2];
        //gval hvec = dval[i]*2
        //gval hvecm1 = dval[i]
        //gval hvecm2 = dval[i]/2
        //if(!in hvecm1), pr bad is for 2*gval = dval*2
        //if(!in hvecm2), pr bad is for 2*gval = dval
        //if(in hvec) pr good is for gval/2 = dval
        //therefore need to be minus two hash funcs
        for(size_t i = 0; i < (size_t)hvals.rows(); i++){
            // Number of columns whose bin in the g-2 tables is empty, i.e.
            // the number of times the row is claimed not to be near a vector.
            size_t num_far = 0;
            for(size_t j = 0; j < (size_t)hvals.cols(); j++){
                // NOTE(review): the bin is computed with hvec[j].size() but
                // also indexes hvecm2[j]; this assumes both tables have the
                // same number of bins - confirm.
                const size_t bin = get_bin(hvals(i, j), hvec[j].size());
                if(hvecm2[j][bin] == 0){
                    num_far++;
                }
            }
            if(num_far < 1){
                scores[g-2]++;
            }
        }
        scores[g-2]/=rand_scores.rows();
    }

    // Diagnostic output. scores[0] only exists when there are at least three
    // table groups, so skip the whole diagnostic otherwise.
    if(!scores.empty()){
        size_t num_bel_min = 0;
        for(size_t r = 0; r < (size_t)rand_vecs.rows(); r++){
            // Distance from random vector r to its nearest input vector.
            float mindis=mnorm;
            list<rmf>::iterator lcur;
            for(lcur=in_vecs.begin(); lcur != in_vecs.end(); lcur++){
                for(size_t i = 0; i < (size_t)lcur->rows(); i++){
                    float cdist = (rand_vecs.row(r) - lcur->row(i)).norm();
                    if(cdist < mindis){
                        mindis = cdist;
                    }
                }
            }
            if(mindis < 0.25){
                num_bel_min++;
            }
        }
        cout << scores[0]*rand_vecs.rows() << ", " << num_bel_min << endl;
    }

    return scores;
}
int main(int argc, char* argv[]) { // compiler will whine about it being deprecated, but taking it out // blows things up // used for GIO g_type_init(); g_log_set_always_fatal(G_LOG_LEVEL_ERROR); g_log_set_handler(G_LOG_DOMAIN, G_LOG_LEVEL_DEBUG | G_LOG_LEVEL_ERROR | G_LOG_LEVEL_WARNING | G_LOG_LEVEL_MESSAGE | G_LOG_FLAG_RECURSION | G_LOG_FLAG_FATAL, erln8_log, NULL); homedir = g_getenv("ERLN8_HOME"); if(homedir == NULL) { homedir = g_get_home_dir(); } else { // builds will fail if ERLN8_HOME is not an absolute path if(homedir[0] != '/') { g_error("ERLN8_HOME must be an absolute path\n"); } } g_debug("home directory = %s\n", homedir); gchar* basename = g_path_get_basename(argv[0]); g_debug("basename = %s\n", basename); if((!strcmp(basename, "erln8")) || (!strcmp(basename, "./erln8"))) { erln8(argc, argv); g_free(basename); } else { gchar* erl = which_erlang(); if(erl == NULL) { g_message("Can't find an " ERLN8_CONFIG_FILE " file to use\n"); g_error("No " ERLN8_CONFIG_FILE " file\n"); } GHashTable* erlangs = get_erlangs(); GHashTable* runtime_options = get_erln8(); gchar* path = g_hash_table_lookup(erlangs, erl); if(path == NULL) { g_hash_table_destroy(erlangs); g_hash_table_destroy(runtime_options); g_error("Version of Erlang (%s) isn't configured in erln8\n", erl); } gchar* use_color = (gchar*)g_hash_table_lookup(runtime_options, "color"); if(g_strcmp0(use_color, "true") == 0) { opt_color = TRUE; } else { opt_color = FALSE; } gchar* use_banner = (gchar*)g_hash_table_lookup(runtime_options, "banner"); if(g_strcmp0(use_banner, "true") == 0) { opt_banner = TRUE; } else { opt_banner = FALSE; } gchar* s = get_bin(erl, basename); g_debug("%s\n",s); gboolean result = g_file_test(s, G_FILE_TEST_EXISTS | G_FILE_TEST_IS_REGULAR); g_free(basename); if(!result) { g_hash_table_destroy(erlangs); g_hash_table_destroy(runtime_options); g_free(s); g_error("Can't run %s, check to see if the file exists\n", s); } if(opt_banner) { printf("%s", red()); printf("erln8: %s", blue()); 
printf("using Erlang %s", path); printf("%s\n", color_reset()); } g_hash_table_destroy(erlangs); g_hash_table_destroy(runtime_options); // can't free s execv(s, argv); } return 0; }
// Returns the skip probability stored in `parameters` for the bin that
// get_bin() selects for the pair (k_level1, k_level2).
// The bin index is only range-checked by assert, i.e. not at all when
// asserts are compiled out.
double get_skip_probability(const KHMMParameters& parameters, double k_level1, double k_level2)
{
    // Bin of the skip-probability table for this pair of levels.
    size_t bin = get_bin(parameters, k_level1, k_level2);
    assert(bin < parameters.skip_probabilities.size());
    return parameters.skip_probabilities[bin];
}
void KHMMParameters::train() { TrainingData& td = training_data; // // Profile HMM transitions // fprintf(stderr, "TRANSITIONS\n"); size_t sum_m_not_k = get(td.state_transitions, statechar2index('M'), statechar2index('M')) + get(td.state_transitions, statechar2index('M'), statechar2index('E')); size_t me = get(td.state_transitions, statechar2index('M'), statechar2index('E')); double p_me_not_k = (double)me / sum_m_not_k; fprintf(stderr, "M->E|not_k: %lf\n", p_me_not_k); size_t sum_e = 0; for(int j = 0; j < td.state_transitions.n_cols; ++j) { sum_e += get(td.state_transitions, statechar2index('E'), j); } size_t ee = get(td.state_transitions, statechar2index('E'), statechar2index('E')); double p_ee = (double)ee / sum_e; fprintf(stderr, "E->E: %lf\n", p_ee); for(int i = 0; i < td.state_transitions.n_rows; ++i) { fprintf(stderr, "\t%c: ", "MEK"[i]); for(int j = 0; j < td.state_transitions.n_cols; ++j) { fprintf(stderr, "%d ", get(td.state_transitions, i, j)); } fprintf(stderr, "\n"); } if(sum_e == 0 || sum_m_not_k == 0) { // insufficient data to train, use defaults return; } trans_m_to_e_not_k = p_me_not_k; trans_e_to_e = p_ee; // // Signal-dependent skip probability // // Initialize observations with pseudocounts from the current model size_t num_bins = skip_probabilities.size(); uint32_t pseudocount = 100; std::vector<double> total_observations(num_bins, 0.0f); std::vector<double> skip_observations(num_bins, 0.0f); for(size_t bin = 0; bin < num_bins; bin++) { skip_observations[bin] = skip_probabilities[bin] * pseudocount; total_observations[bin] = pseudocount; } for(size_t oi = 0; oi < td.kmer_transitions.size(); ++oi) { const KmerTransitionObservation& to = td.kmer_transitions[oi]; bool is_skip = to.state == 'K'; size_t bin = get_bin(*this, to.level_1, to.level_2); skip_observations[bin] += is_skip; total_observations[bin] += 1; } // Update probabilities for(size_t bin = 0; bin < num_bins; bin++) { skip_probabilities[bin] = skip_observations[bin] / 
total_observations[bin]; fprintf(stderr, "SKIPLEARN -- bin[%zu] %.3lf %.3lf %.3lf\n", bin, skip_observations[bin], total_observations[bin], skip_probabilities[bin]); } }
// Extracts `t` from `message`, choosing between the binary reader
// (get_bin) and the two-argument get() overload according to binary_mode.
inline void
get(zmq::message_t& message, T& t, bool binary_mode)
{
  if (binary_mode)
  {
    get_bin(message, t);
  }
  else
  {
    get(message, t);
  }
}
/*
 * Re-parents a bin: attaches 'bin' beneath parent 'pto' in contig 'cto' and
 * detaches it from parent 'pfrom' in contig 'cfrom'.
 *
 * A parent record equal to the contig's own record number means the bin
 * becomes (or was) the root bin of that contig; otherwise the parent is a
 * bin and 'child_no' selects which child slot of the new parent is filled.
 * A 'pfrom' of 0 (or less) means there is nothing to detach from.
 *
 * Returns 0 on success
 *        -1 on failure
 */
static int break_contig_move_bin(GapIO *io, bin_index_t *bin,
                                 contig_t *cfrom, tg_rec pfrom,
                                 contig_t *cto,   tg_rec pto,
                                 int child_no) {
    /* Add to */
    if (pto == cto->rec) {
        /* Parent is a contig */
        if (cto->bin && bin->rec != cto->bin) {
            /*
             * The contig's previous root bin is being replaced, so that is
             * the record to free. (This used to pass cto->rec, a contig
             * record number, as a GT_Bin record.)
             */
            cache_rec_deallocate(io, GT_Bin, cto->bin);
        }
        cto->bin = bin->rec;
        cto->start = 1;
        cto->end = bin->size;

        bin->parent = cto->rec;
        bin->parent_type = GT_Contig;
        bin->flags |= BIN_BIN_UPDATED;
    } else {
        /* Parent is a bin */
        bin_index_t *pb;

        if (!(pb = get_bin(io, pto)))
            return -1;
        if (!(pb = cache_rw(io, pb)))
            return -1;

        pb->child[child_no] = bin->rec;
        pb->flags |= BIN_BIN_UPDATED;

        bin->parent = pto;
        bin->parent_type = GT_Bin;
        bin->flags |= BIN_BIN_UPDATED;
    }

    /* Remove from: NB it may not exist? */
    if (pfrom == cfrom->rec) {
        /* Parent is a contig */
        if (cfrom->bin != bin->rec) {
            fprintf(stderr, "pfrom incorrect\n");
            return -1;
        }

        cfrom->bin = 0;
    } else if (pfrom > 0) {
        /* Parent is a bin */
        bin_index_t *pb;

        if (!(pb = get_bin(io, pfrom)))
            return -1;
        if (!(pb = cache_rw(io, pb)))
            return -1;

        if (pb->child[0] != bin->rec && pb->child[1] != bin->rec) {
            fprintf(stderr, "pfrom incorrect\n");
            return -1;
        }

        /* pb is already writable here; clear whichever slot held the bin */
        if (pb->child[0] == bin->rec)
            pb->child[0] = 0;
        else
            pb->child[1] = 0;

        pb->flags |= BIN_BIN_UPDATED;
    }

    return 0;
}
/*
 * Toggles the BIN_COMPLEMENTED flag of bin record 'bnum'.
 *
 * The bin is made writable with cache_rw() before it is modified, as is done
 * for every other bin edit in this file, and is flagged BIN_BIN_UPDATED.
 * Nothing is changed if the bin cannot be loaded or made writable.
 */
static void complement_bin(GapIO *io, tg_rec bnum) {
    bin_index_t *bin = get_bin(io, bnum);

    if (!bin)
        return;
    if (!(bin = cache_rw(io, bin)))
        return;

    bin->flags ^= BIN_COMPLEMENTED;
    bin->flags |= BIN_BIN_UPDATED;
}
/*
 * Breaks a contig in two such that snum is the right-most reading of
 * a new contig.
 *
 * io    The database handle.
 * crec  Record number of the contig to break.
 * cpos  The break point.
 *
 * The right-hand part is moved to a newly created contig named
 * "<old name>#<n>", using the first n (from 1) not already in use.
 * Either contig is destroyed again if it ends up with no bin.
 *
 * NOTE(review): the result of break_contig_recurse() is not checked here,
 * as in the original code - confirm whether a failure should abort.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int break_contig(GapIO *io, tg_rec crec, int cpos) {
    contig_t *cl;
    contig_t *cr;
    int cid;
    char cname[1024], *cname_end;
    int left_end, right_start;
    bin_index_t *bin;
    int do_comp = 0;
    HacheTable *h;

    cl = (contig_t *)cache_search(io, GT_Contig, crec);
    if (!cl)
        return -1;

    //contig_dump_ps(io, &cl, "/tmp/tree.ps");

    /*
     * Our hash table is keyed on sequence record numbers for all sequences
     * in all bins spanning the break point. The value is either 0 or 1
     * for left/right contig.
     *
     * The purpose of this hash is to allow us to work out whether a tag
     * belongs in the left or right contig, as a tag could start beyond the
     * break point but be attached to a sequence before the break point.
     *
     * Further complicating this is that a tag could be in a smaller bin
     * than the sequence as it may not be as long. However we know
     * we'll recurse down these in a logical order so we can be sure
     * we've already "seen" the sequence that the tag has been
     * attached to.
     */
    h = HacheTableCreate(1024, HASH_DYNAMIC_SIZE);
    if (!h)
        return -1;

    /* strncpy does not terminate when the name is 1000+ chars long */
    strncpy(cname, contig_get_name(&cl), 1000);
    cname[1000] = '\0';
    cname_end = cname + strlen(cname);
    cid = 1;
    do {
        sprintf(cname_end, "#%d", cid++);
    } while (contig_index_query(io, cname) > 0);

    if (!(cr = contig_new(io, cname))) {
        HacheTableDestroy(h, 0);
        return -1;
    }
    cl = cache_rw(io, cl);
    cr = cache_rw(io, cr);
    if (!cl || !cr) {
        HacheTableDestroy(h, 0);
        return -1;
    }
    if (0 != contig_index_update(io, cname, strlen(cname), cr->rec)) {
        HacheTableDestroy(h, 0);
        return -1;
    }
    printf("Break in contig %"PRIrec", pos %d\n", crec, cpos);

    printf("Existing left bin = %"PRIrec", right bin = %"PRIrec"\n",
           cl->bin, cr->bin);

    cache_incr(io, cl);
    cache_incr(io, cr);

    bin = get_bin(io, cl->bin);
    if (!bin) {
        cache_decr(io, cl);
        cache_decr(io, cr);
        HacheTableDestroy(h, 0);
        return -1;
    }
    do_comp = bin->flags & BIN_COMPLEMENTED;

    break_contig_recurse(io, h, cl, cr,
                         contig_get_bin(&cl), cpos, contig_offset(io, &cl),
                         0, cl->rec, cr->rec, 0, 0);

    /* Recompute end positions */
    left_end    = contig_visible_end(io, cl->rec);
    right_start = contig_visible_start(io, cr->rec);

    /* Ensure start/end positions of contigs work out */
    bin = get_bin(io, cr->bin);
    if (bin)
        bin = cache_rw(io, bin);
    if (!bin) {
        cache_decr(io, cl);
        cache_decr(io, cr);
        HacheTableDestroy(h, 0);
        return -1;
    }

//#define KEEP_POSITIONS 1
#ifndef KEEP_POSITIONS
    cr->start = 1;
    cr->end = cl->end - right_start + 1;
    bin->pos -= right_start-1;
#else
    cr->start = right_start;
    cr->end = cl->end;
#endif

    /* Give the right root bin the same orientation the left root had */
    if ((do_comp && !(bin->flags & BIN_COMPLEMENTED)) ||
        (!do_comp && (bin->flags & BIN_COMPLEMENTED))) {
        bin->flags ^= BIN_COMPLEMENTED;
    }

    cl->end = left_end;

    //    remove_redundant_bins(io, cl);
    //    remove_redundant_bins(io, cr);

    printf("Final left bin = %"PRIrec", right bin = %"PRIrec"\n",
           cl->bin, cr->bin);

    HacheTableDestroy(h, 0);

    //if (cl->bin) contig_dump_ps(io, &cl, "/tmp/tree_l.ps");
    //if (cr->bin) contig_dump_ps(io, &cr, "/tmp/tree_r.ps");

    cache_flush(io);

    remove_empty_bins(io, cl->rec);
    remove_empty_bins(io, cr->rec);

    /* Empty contig? If so remove it completely */
    if (cl->bin == 0) {
        printf("Removing empty contig %"PRIrec"\n", cl->rec);
        contig_destroy(io, cl->rec);
    }
    if (cr->bin == 0) {
        printf("Removing empty contig %"PRIrec"\n", cr->rec);
        contig_destroy(io, cr->rec);
    }

    cache_decr(io, cl);
    cache_decr(io, cr);

    cache_flush(io);

    return 0;
}
/*
 * A recursive break contig function.
 * io           The database handle.
 * h            Hash keyed on sequence record number; the value is 0 when the
 *              sequence stays in the left contig and 1 when it goes right.
 * cl, cr       The left (original) and right (new) contigs.
 * bin_num      The current bin being moved or split.
 * pos          The contig break point.
 * offset       The absolute positional offset of this bin in original contig
 * level        Recursion depth, used only to indent the debug output.
 * pleft        The parent bin/contig record num in the left new contig
 * pright       The parent bin/contig record num in the right new contig
 * child_no     0 or 1 - whether this bin is the left/right child of its parent
 * complement   Non-zero when the bins above this one are, overall,
 *              complemented; toggled here if this bin is complemented too.
 *
 * Returns 0 on success
 *        -1 on failure
 *
 * NOTE(review): the "return -1" paths leave the function without the
 * cache_decr() matching the cache_incr() at the top, and the results of
 * get_bin()/cache_rw() are used without NULL checks.
 */
static int break_contig_recurse(GapIO *io, HacheTable *h,
                                contig_t *cl, contig_t *cr,
                                tg_rec bin_num, int pos, int offset,
                                int level, tg_rec pleft, tg_rec pright,
                                int child_no, int complement) {
    int i, j, f_a, f_b;
    tg_rec rbin;
    bin_index_t *bin = get_bin(io, bin_num), *bin_dup ;
    //int bin_min, bin_max;
    int nseqs;
    tg_rec opright; /* old pright, needed if we revert back */

    cache_incr(io, bin);

    if (bin->flags & BIN_COMPLEMENTED) {
        complement ^= 1;
    }

    /* f_a/f_b are computed here but not used further in this function */
    if (complement) {
        f_a = -1;
        f_b = offset + bin->size-1;
    } else {
        f_a = +1;
        f_b = offset;
    }

    printf("%*sBreak offset %d pos %d => test bin %"PRIrec": %d..%d\n",
           level*4, "",
           offset, pos, bin->rec,
           NMIN(bin->start_used, bin->end_used),
           NMAX(bin->start_used, bin->end_used));

    /*
     * Remember the sequence count and zero it; each branch below adds back
     * the count appropriate to what ends up in this bin.
     */
    bin = cache_rw(io, bin);
    nseqs = bin->nseqs;
    bin->nseqs = 0;

    /* Invalidate any cached data */
    bin_invalidate_track(io, bin, TRACK_ALL);
    if (bin->flags & BIN_CONS_VALID) {
        bin->flags |= BIN_BIN_UPDATED;
        bin->flags &= ~BIN_CONS_VALID;
    }

    //bin_min = bin->rng ? NMIN(bin->start_used, bin->end_used) : offset;
    //bin_max = bin->rng ? NMAX(bin->start_used, bin->end_used) : offset;

    /*
     * Add to right parent if this bin is to the right of pos,
     * or if the used portion is to the right and we have no left child.
     *
     * FIXME: Not a valid assumption!
     * The used portion of a bin is not a placeholder for the used portion
     * of all the the children beneath it. Therefore if the used portion of
     * this bin is > pos (and we have no left child) it still doesn't mean
     * that the absolute positions of the used portion of the right child
     * won't be < pos.
     */
    if (offset >= pos /*|| (bin_min >= pos && !bin->child[0])*/) {
        printf("%*sADD_TO_RIGHT pl=%"PRIrec" pr=%"PRIrec"\n",
               level*4, "", pleft, pright);
        if (0 != break_contig_move_bin(io, bin,
                                       cl, pleft,
                                       cr, pright,
                                       child_no))
            return -1;

        /* The whole bin moved, so its sequence count is restored as it was */
        bin_incr_nseq(io, bin, nseqs);
        cache_decr(io, bin);

        return 0;
    }

    /*
     * Add to left parent if this bin is entirely to the left of pos,
     * or if the used portion is to the left and we have no right child.
     */
    if (offset + bin->size < pos /*|| (bin_max < pos && !bin->child[1])*/) {
        printf("%*sADD_TO_LEFT\n", level*4, "");

        //if (0 != break_contig_move_bin(io, bin, cr, pright, cl, pleft, child_no))
        //return -1;

        /* Bin stays where it is; only the sequence count is restored */
        bin_incr_nseq(io, bin, nseqs);
        cache_decr(io, bin);

        return 0;
    }

    /*
     * Nominally the bin overlaps both left and right and so needs duplicating.
     * There are cases though at the roots of our trees where duplicating is
     * unnecessary as it leads to empty bins at the root. In this case
     * we skip creating a duplicate for the right, or alternatively steal
     * the left root bin and use that instead.
     *
     * Similarly the range_t array will either be left where it is, moved to
     * the right contig, or split in half (creating a new one for the right).
     *
     * FIXED: always need this. Eg:
     *
     * |-------------empty--------------|
     * |----------------|---------------|
     * |--------|-------|--------|------|
     *             ^
     *             |
     *             break here
     *
     * In this case we need to duplicate the parent as it overlaps the left
     * bin, which may (or may not) have data that needs to end up in the right
     * hand contig. Just duplicate for now and free later on if needed.
     */
    if (1 /* always! */ || pright != cr->rec ||
        (bin->rng && NMAX(bin->start_used, bin->end_used) >= pos)) {
        //printf("NMAX=%d >= %d\n", NMAX(bin->start_used, bin->end_used), pos);

        rbin = 0;

        /* Possibly steal left contig's bin */
        if (pleft == cl->rec &&
            NMIN(bin->start_used, bin->end_used) >= pos) {
#if 0
            /* Currently this doesn't always work */
            if (bin->child[1]) {
                bin_index_t *ch = get_bin(io, bin->child[1]);
                if (NMIN(ch->pos, ch->pos + ch->size-1) >= pos) {
                    rbin = cl->bin;
                    cl->bin = bin->child[0];
                }
            }
#else
            pleft = bin->rec;
#endif
        } else {
            pleft = bin->rec;
        }

        /* Create new bin, or use root of contig if it's unused so far */
        if (!rbin && pright == cr->rec) {
            rbin = cr->bin;
        }

        /* Otherwise we genuinely need a duplicate */
        if (!rbin)
            rbin = bin_new(io, 0, 0, 0, GT_Bin);

        /* Initialise with duplicate values from left bin */
        bin_dup = get_bin(io, rbin);
        bin_dup = cache_rw(io, bin_dup);
        bin_dup->size = bin->size;
        bin_dup->pos = bin->pos;
        bin_dup->parent = pright;
        bin_dup->parent_type = (pright == cr->rec ? GT_Contig : GT_Bin);
        bin_dup->flags = bin->flags | BIN_BIN_UPDATED;
        bin_dup->start_used = bin->start_used;
        bin_dup->end_used = bin->end_used;

        /*
         * Shift bin to offset if it's the contig root.
         * It'll be shifted back by the correct amount later.
         */
        if (pright == cr->rec) {
            printf("moving root bin to offset=%d comp=%d\n", offset, complement);
            bin_dup->pos = offset;
        }

        printf("%*sCreated dup for right, rec %"PRIrec"\n",
               level*4,"", bin_dup->rec);
        /* NOTE(review): the return value of this call is not checked */
        break_contig_move_bin(io, bin_dup, cl, 0, cr, pright, child_no);

        /* Children of this bin now hang beneath the duplicate on the right */
        opright = pright;
        pright = bin_dup->rec;
    } else {
        bin_dup = NULL;
        pleft = bin->rec;
    }

    if (!bin->rng) {
        /* Empty bin */
        printf("%*sEMPTY range\n", level*4, "");
        bin->start_used = bin->end_used = 0;
        bin->flags |= BIN_BIN_UPDATED;
        if (bin_dup) {
            bin_dup->start_used = bin_dup->end_used = 0;
            bin_dup->flags |= BIN_BIN_UPDATED;
        }

    } else if (NMIN(bin->start_used, bin->end_used) >= pos) {
        /* Move range to right contig */
        printf("%*sDUP %"PRIrec", MOVE Array to right\n",
               level*4, "", bin_dup->rec);

        bin_dup->rng = bin->rng;
        bin_dup->rng_rec = bin->rng_rec;
        bin_dup->rng_free = bin->rng_free;
        if (bin_dup->rng_rec)
            bin_dup->flags |= BIN_RANGE_UPDATED;

        /* Only detach the array from the left bin if it is a different bin */
        if (bin->rec != bin_dup->rec) {
            bin->rng = NULL;
            bin->rng_rec = 0;
            bin->rng_free = -1;
            bin->flags |= BIN_BIN_UPDATED;
        }

        bin->start_used = bin->end_used = 0;
        break_contig_reparent_seqs(io, bin_dup);

        if (bin_dup->rng) {
            int n = ArrayMax(bin_dup->rng);
            /* Record every non-annotation item as belonging to the right */
            for (i = j = 0; i < n; i++) {
                range_t *r = arrp(range_t, bin_dup->rng, i), *r2;
                if (r->flags & GRANGE_FLAG_UNUSED)
                    continue;

                if ((r->flags & GRANGE_FLAG_ISMASK) != GRANGE_FLAG_ISANNO) {
                    HacheData hd;
                    hd.i = 1;
                    HacheTableAdd(h, (char *)&r->rec, sizeof(r->rec), hd,NULL);
                    j++;
                }
            }
            bin_incr_nseq(io, bin_dup, j);
        }
    } else if (NMAX(bin->start_used, bin->end_used) < pos) {
        /* Range array already in left contig, so do nothing */
        printf("%*sMOVE Array to left\n", level*4, "");

        if (bin_dup)
            bin_dup->start_used = bin_dup->end_used = 0;

        if (bin->rng) {
            int n = ArrayMax(bin->rng);
            /* Record every non-annotation item as belonging to the left */
            for (i = j = 0; i < n; i++) {
                range_t *r = arrp(range_t, bin->rng, i);
                if (r->flags & GRANGE_FLAG_UNUSED)
                    continue;

                if ((r->flags & GRANGE_FLAG_ISMASK) != GRANGE_FLAG_ISANNO) {
                    HacheData hd;
                    hd.i = 0;
                    HacheTableAdd(h, (char *)&r->rec, sizeof(r->rec), hd,NULL);
                    j++;
                }
            }
            bin_incr_nseq(io, bin, j);
        }
    } else {
        /* Range array covers pos, so split in two */
        int n, nl = 0, nr = 0;
        /* Extents of the items kept on the left / moved to the right */
        int lmin = bin->size, lmax = 0, rmin = bin->size, rmax = 0;

        printf("%*sDUP %"PRIrec", SPLIT array\n", level*4, "", bin_dup->rec);

        bin->flags |= BIN_RANGE_UPDATED;
        bin_dup->flags |= BIN_RANGE_UPDATED;

        bin_dup->rng = ArrayCreate(sizeof(range_t), 0);
        bin_dup->rng_free = -1;

        /* Pass 1 - hash sequences */
        n = ArrayMax(bin->rng);
        for (i = 0; i < n; i++) {
            range_t *r = arrp(range_t, bin->rng, i);
            int cstart; /* clipped sequence positions */
            seq_t *s;

            if (r->flags & GRANGE_FLAG_UNUSED)
                continue;

            if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISANNO)
                continue;

            s = (seq_t *)cache_search(io, GT_Seq, r->rec);
            if ((s->len < 0) ^ complement) {
                cstart = NMAX(r->start, r->end) - (s->right-1);
            } else {
                cstart = NMIN(r->start, r->end) + s->left-1;
            }

            /* Sequences whose clipped start is at/after pos go right (1) */
            if (cstart >= pos) {
                HacheData hd;
                hd.i = 1;
                HacheTableAdd(h, (char *)&r->rec, sizeof(r->rec), hd, NULL);
            } else {
                HacheData hd;
                hd.i = 0;
                HacheTableAdd(h, (char *)&r->rec, sizeof(r->rec), hd, NULL);
            }
        }

        /* Pass 2 - do the moving of anno/seqs */
        n = ArrayMax(bin->rng);
        /* i reads every item; j is the write index compacting the left array */
        for (i = j = 0; i < n; i++) {
            range_t *r = arrp(range_t, bin->rng, i), *r2;
            int cstart; /* clipped sequence positions */

            if (r->flags & GRANGE_FLAG_UNUSED)
                continue;

            if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISANNO) {
                cstart = NMAX(r->start, r->end);
            } else {
                seq_t *s = (seq_t *)cache_search(io, GT_Seq, r->rec);
                if ((s->len < 0) ^ complement) {
                    cstart = NMAX(r->start, r->end) - (s->right-1);
                } else {
                    cstart = NMIN(r->start, r->end) + s->left-1;
                }
            }

            if (cstart >= pos &&
                ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISANNO)) {
                anno_ele_t *a = (anno_ele_t *)cache_search(io,
                                                           GT_AnnoEle,
                                                           r->rec);
                /* If it's an annotation on a sequence < pos then we
                 * still don't move.
                 *
                 * FIXME: we have no guarantee that the sequence being
                 * annotated is in the same bin as this annotation, as
                 * they may be different sizes and end up in different
                 * bins. (Should we enforce anno always in same bin as seq?
                 * If so, consensus annos fit anywhere?)
                 */
                if (a->obj_type == GT_Seq) {
                    HacheItem *hi = HacheTableSearch(h,
                                                     (char *)&r->pair_rec,
                                                     sizeof(r->pair_rec));

                    if (hi) {
                        /* Owning sequence stays left: force anno left too */
                        if (hi->data.i == 0)
                            cstart = pos-1;
                    } else {
                        puts("FIXME: annotation for seq in unknown place - "
                             "work out correct location and move if needed.");
                    }
                }
            }

            if (cstart >= pos) {
                /* Append a copy of the item to the right-hand array */
                r2 = (range_t *)ArrayRef(bin_dup->rng, ArrayMax(bin_dup->rng));
                *r2 = *r;
                if (rmin > r->start) rmin = r->start;
                if (rmin > r->end)   rmin = r->end;
                if (rmax < r->start) rmax = r->start;
                if (rmax < r->end)   rmax = r->end;
                if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISSEQ)
                    nr++;
            } else {
                if (lmin > r->start) lmin = r->start;
                if (lmin > r->end)   lmin = r->end;
                if (lmax < r->start) lmax = r->start;
                if (lmax < r->end)   lmax = r->end;

                /* Keep on the left, shuffling down over moved items */
                if (j != i) {
                    r2 = arrp(range_t, bin->rng, j);
                    *r2 = *r;
                }
                j++;
                if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISSEQ)
                    nl++;
            }
        }
        bin_incr_nseq(io, bin, nl);
        bin_incr_nseq(io, bin_dup, nr);

        /* Truncate the left array to the items that were kept */
        ArrayMax(bin->rng) = j;

#if 0
        /*
         * Right now this causes problems, but I'm not sure why. Try again
         * after we've fixed the bin->nseqs issues and other deallocation
         * woes.
         */

        if (ArrayMax(bin_dup->rng) == 0 && bin_dup->parent_type == GT_Bin) {
            /* We didn't need it afterall! Odd. */
            bin_index_t *pb;

            printf("Purging bin %d that we didn't need afterall\n",
                   bin_dup->rec);
            cache_rec_deallocate(io, GT_Bin, bin_dup->rec);
            pb = cache_search(io, GT_Bin, bin_dup->parent);
            if (pb->child[0] == bin_dup->rec)
                pb->child[0] = 0;
            if (pb->child[1] == bin_dup->rec)
                pb->child[1] = 0;
            bin_dup = NULL;
            pright = opright;
        }
#endif

        if (bin_dup)
            break_contig_reparent_seqs(io, bin_dup);

        if (lmin < lmax) {
            bin->start_used = lmin;
            bin->end_used = lmax;
        } else {
            /* No data left in bin */
            bin->start_used = 0;
            bin->end_used = 0;
        }

        printf("%*sLeft=>%d..%d right=>%d..%d\n", level*4, "",
               lmin, lmax, rmin, rmax);

        if (bin_dup) {
            if (rmin < rmax) {
                bin_dup->start_used = rmin;
                bin_dup->end_used = rmax;
            } else {
                /* No data moved in bin */
                bin_dup->start_used = 0;
                bin_dup->end_used = 0;
            }
        }
    }

    /* Recurse */
    for (i = 0; i < 2; i++) {
        bin_index_t *ch;
        if (!bin->child[i])
            continue;

        ch = get_bin(io, bin->child[i]);
        if (0 != break_contig_recurse(io, h, cl, cr, bin->child[i], pos,
                                      NMIN(ch->pos, ch->pos + ch->size-1),
                                      level+1, pleft, pright,
                                      i, complement))
            return -1;
    }

    cache_decr(io, bin);
    //    if (bin_dup)
    //        cache_decr(io, bin_dup);

    return 0;
}